Setting R code chunk options

Loading packages and initializing

# Load required packages
library(readxl)      # read .xlsx inputs
library(openxlsx)
library(writexl)
library(dplyr)
library(lubridate)   # ym(), year()
library(ggplot2)
library(cowplot)
library(forecast)  
library(Kendall)
library(tseries)
library(outliers)    # grubbs.test()
library(tidyverse)
library(smooth)
library(trend)
library(kableExtra)
library(tidyr)
library(gt) #install.packages("gt")
library(gridExtra) #install.packages("gridExtra")
library(zoo)         # na.approx()
library(imputeTS)    # na_interpolation()


# Check working directory (paths below are relative to it)
getwd()
## [1] "C:/Users/thepa/Documents/TSA_FinalProject/Code"

Importing and Wrangling Data

## Raw Data Set: Unemployment Rate by Age (Thousands)
# Import the raw workbook
UEAge.Thou <- read_excel(
  path = "../Data/Raw/UE_Age(Thousands).xlsx", sheet = "Sheet1", col_names = TRUE)

# Tidy: parse "YYYYMmm" month strings into Dates, coerce the age columns
# to numeric, then fix the column order and sort by country and month.
UEAge.Thou_Processed <- UEAge.Thou %>%
  rename(Country = "Reference area") %>%
  mutate(
    Month = ym(sub("M", "-", Month)),            # e.g. "2001M04" -> 2001-04-01
    Age15to24.Thou = as.numeric(`15-24`),
    Age25above.Thou = as.numeric(`25+`),
    AgeTotal.Thou = as.numeric(`15+`)
  ) %>%
  select(Country, Month, Age15to24.Thou, Age25above.Thou, AgeTotal.Thou) %>%
  arrange(Country, Month)

## Raw Data Set: Unemployment Rate by Age (%)
# Import the raw workbook
UEAge.Per <- read_excel(
  path = "../Data/Raw/UE_Age(%).xlsx", sheet = "Sheet1", col_names = TRUE)

# Tidy: parse "YYYYMmm" month strings into Dates, coerce the rate columns
# to numeric, then fix the column order and sort by country and month.
UEAge.Per_Processed <- UEAge.Per %>%
  rename(Country = "Reference area") %>%
  mutate(
    Month = ym(sub("M", "-", Month)),
    Age15to24.Per = as.numeric(`15-24`),
    Age25above.Per = as.numeric(`25+`),
    AgeTotal.Per = as.numeric(`15+`)
  ) %>%
  select(Country, Month, Age15to24.Per, Age25above.Per, AgeTotal.Per) %>%
  arrange(Country, Month)

## Raw Data Set: Unemployment Rate by Gender (Thousands)
# Import the raw workbook
UEGender.Thou <- read_excel(
  path = "../Data/Raw/UE_Gender(Thousands).xlsx", sheet = "Sheet1", col_names = TRUE)

# Tidy: parse the month index, coerce the gender columns to numeric,
# then fix the column order and sort by country and month.
UEGender.Thou_Processed <- UEGender.Thou %>%
  rename(Country = "Reference area") %>%
  mutate(
    Month = ym(sub("M", "-", Month)),
    Female.Thou = as.numeric(Female),
    Male.Thou = as.numeric(Male),
    Total.Thou = as.numeric(Total)
  ) %>%
  select(Country, Month, Female.Thou, Male.Thou, Total.Thou) %>%
  arrange(Country, Month)

## Raw Data Set: Unemployment Rate by Gender (%)
# Import the raw workbook
UEGender.Per <- read_excel(
  path = "../Data/Raw/UE_Gender(%).xlsx", sheet = "Sheet1", col_names = TRUE)

# Tidy: parse the month index, coerce the gender columns to numeric,
# then fix the column order and sort by country and month.
UEGender.Per_Processed <- UEGender.Per %>%
  rename(Country = "Reference area") %>%
  mutate(
    Month = ym(sub("M", "-", Month)),
    Female.Per = as.numeric(Female),
    Male.Per = as.numeric(Male),
    Total.Per = as.numeric(Total)
  ) %>%
  select(Country, Month, Female.Per, Male.Per, Total.Per) %>%
  arrange(Country, Month)
# Combine all processed data sets on the country-month key.
# Fold the four tables together with successive left joins.
UE_Countries <- list(
  UEAge.Thou_Processed,
  UEAge.Per_Processed,
  UEGender.Thou_Processed,
  UEGender.Per_Processed
) %>%
  reduce(left_join, by = c("Country", "Month"))

# Extract Peru data.
# FIX: the original chained select(-Country, AgeTotal.Per, AgeTotal.Thou)
# mixed negative and positive selections in one call (which dplyr warns
# about); the explicit column list below is all that is needed and also
# drops Country and the AgeTotal.* columns.
Peru <- UE_Countries %>% 
  filter(Country == "Peru") %>% 
  select(Month, Age15to24.Per, Age25above.Per, Female.Per, Male.Per,
         Total.Per, Age15to24.Thou, Age25above.Thou, Female.Thou, Male.Thou,
         Total.Thou) 

# Check missing values (0 expected at this point)
sum(is.na(Peru))
## [1] 0
# Extract US data, restricted to the 2001-2024 analysis window.
# FIX: as with Peru, the mixed negative/positive select() was redundant
# and warning-prone; the explicit column list below suffices and drops
# Country and the AgeTotal.* columns.
US <- UE_Countries %>% 
  filter(Country == "United States of America",
         Month >= as.Date("2001-01-01") & Month <= as.Date("2024-12-01")) %>% 
  select(Month, Age15to24.Per, Age25above.Per, Female.Per, Male.Per,
         Total.Per, Age15to24.Thou, Age25above.Thou, Female.Thou, Male.Thou,
         Total.Thou) 

# Check missing values (0 expected at this point)
sum(is.na(US))
## [1] 0
# Generate global unemployment data using a simple (unweighted) average
# across countries for each month.
UE_Global <- UE_Countries %>%
  group_by(Month) %>%
  summarise(
    Age15to24.Thou = mean(Age15to24.Thou, na.rm = TRUE),
    Age25above.Thou = mean(Age25above.Thou, na.rm = TRUE), 
    AgeTotal.Thou = mean(AgeTotal.Thou, na.rm = TRUE), 
    # BUG FIX: this previously averaged Age15to24.Thou (a level, in
    # thousands) into the Age15to24.Per column, corrupting the youth
    # rate series; it now averages the rate column.
    Age15to24.Per = mean(Age15to24.Per, na.rm = TRUE),
    Age25above.Per = mean(Age25above.Per, na.rm = TRUE), 
    AgeTotal.Per = mean(AgeTotal.Per, na.rm = TRUE), 
    Female.Thou = mean(Female.Thou, na.rm = TRUE),
    Male.Thou = mean(Male.Thou, na.rm = TRUE),
    Total.Thou = mean(Total.Thou, na.rm = TRUE),
    Female.Per = mean(Female.Per, na.rm = TRUE),
    Male.Per = mean(Male.Per, na.rm = TRUE),
    Total.Per = mean(Total.Per, na.rm = TRUE))

# Generate global unemployment data using a weighted average.
# Each rate is aggregated as sum(unemployed) / sum(unemployed / rate),
# i.e. total unemployed divided by the implied total labor force.
UE_Global_Weighted <- UE_Countries %>%
  # Keep only rows complete across all 12 series (columns 3:14).
  # complete.cases() replaces the slower row-wise apply(..., 1, ...),
  # which also coerces the data frame to a matrix.
  filter(complete.cases(UE_Countries[, 3:14])) %>%
  group_by(Month) %>%
  summarise(
    Age15to24.Per = sum(Age15to24.Thou) / sum(Age15to24.Thou / Age15to24.Per), 
    Age25above.Per = sum(Age25above.Thou) / sum(Age25above.Thou / Age25above.Per),
    Female.Per = sum(Female.Thou) / sum(Female.Thou / Female.Per),
    Male.Per = sum(Male.Thou) / sum(Male.Thou / Male.Per),
    Total.Per = sum(Total.Thou) / sum(Total.Thou / Total.Per),
    
    # Levels keep the simple cross-country mean, as before.
    Age15to24.Thou = mean(Age15to24.Thou, na.rm = TRUE),
    Age25above.Thou = mean(Age25above.Thou, na.rm = TRUE),
    Female.Thou = mean(Female.Thou, na.rm = TRUE),
    Male.Thou = mean(Male.Thou, na.rm = TRUE),
    Total.Thou = mean(Total.Thou, na.rm = TRUE)
  )

# Collapse the weighted monthly series to yearly means.
# Month (a Date) is excluded by where(is.numeric); Year is the grouping key.
UE_Global_Yearly <- UE_Global_Weighted %>%
  group_by(Year = year(Month)) %>%
  summarize(across(where(is.numeric), mean))
  # Optionally restrict the span: filter(Year >= 2000 & Year <= 2024)

Descriptive Statistics

Global Unemployment Status

# Check total no. of countries 
print(length(unique(UE_Countries$Country)))
## [1] 91
# Plot global unemployment rate for simple average vs. weighted average 
# Stack the two monthly series, tagged by aggregation method, for one plot
UE_Global_Combined <- bind_rows(
  UE_Global %>% mutate(Source = "Simple Average"),
  UE_Global_Weighted %>% mutate(Source = "Weighted Average")
)

# One line per aggregation method, distinguished by color
ggplot(UE_Global_Combined, aes(x = Month, y = Total.Per, color = Source)) +
  geom_line() +               
  labs(title = "Global Unemployment Rate: Simple Vs. Weighted Average",
       x = "Month", y = "Global Unemployment Rate (%)", color = "Source") 

# Plots
# Global Unemployment Rate (weighted series, yearly means)
ggplot(UE_Global_Yearly, aes(x = Year, y = Total.Per)) +
  geom_line() +               
  labs(title = "Global Unemployment Rate",
       x = "Year", y = "Global Unemployment Rate (%)")

# Global Unemployment Rate by Age
ggplot(UE_Global_Yearly, aes(x = Year)) +
  geom_line(aes(y = Age15to24.Per, color = "Age 15-24")) +
  geom_line(aes(y = Age25above.Per, color = "Age 25+")) +
  labs(title = "Global Unemployment Rate by Age",
       x = "Year", y = "Global Unemployment Rate (%)", color = "Age Group") +
  scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red"))

# Global Unemployment Rate by Sex
ggplot(UE_Global_Yearly, aes(x = Year)) +
  geom_line(aes(y = Female.Per, color = "Female")) +
  geom_line(aes(y = Male.Per, color = "Male")) +
  labs(title = "Global Unemployment Rate by Sex",
       x = "Year", y = "Global Unemployment Rate (%)", color = "Sex Group") +
  scale_color_manual(values = c("Female" = "blue", "Male" = "red"))

Unemployment Status in Peru and United States

Starting in January 2022, Peru revised its methodology for compiling labor statistics. As a result, caution is needed when analyzing employment data across this period.

## Unemployment Status in Peru
p1_PE <- ggplot(Peru, aes(x = Month, y = Total.Thou)) +
  geom_line(color = "blue", alpha = 1) +
  labs(x = "", y = "Unemployment (Thousands)") 

p2_PE <- ggplot(Peru, aes(x = Month, y = Total.Per)) +
  geom_line(color = "red", alpha = 1) +
  labs(x = "Year", y = "Unemployment Rate (%)") 

# Implied labor force (thousands) = unemployed level / rate * 100
Peru_LF <- Peru %>% 
  mutate(LF.Part = (Total.Thou / Total.Per) * 100) 

p3_PE <- ggplot(Peru_LF, aes(x = Month, y = LF.Part)) +
  geom_line(color = "orange", alpha = 1) +
  # FIX: this series is the labor-force size in thousands,
  # not a participation rate, so the old label was misleading.
  labs(x = "", y = "Labor Force (Thousands)")

grid.arrange(p1_PE, p2_PE, p3_PE, ncol = 3,
             top = "Unemployment Status in Peru")

## Unemployment Status in the US
p1_US <- ggplot(US, aes(x = Month, y = Total.Thou)) +
  geom_line(color = "blue", alpha = 1) +
  labs(x = "", y = "Unemployment (Thousands)") 

p2_US <- ggplot(US, aes(x = Month, y = Total.Per)) +
  geom_line(color = "red", alpha = 1) +
  labs(x = "Year", y = "Unemployment Rate (%)") 

# Implied labor force (thousands) = unemployed level / rate * 100
US_LF <- US %>% 
  mutate(LF.Part = (Total.Thou / Total.Per) * 100) 

p3_US <- ggplot(US_LF, aes(x = Month, y = LF.Part)) +
  geom_line(color = "orange", alpha = 1) +
  labs(x = "", y = "Labor Force (Thousands)")

# Arrange plots side by side
# BUG FIX: this previously re-drew the Peru panels (p1_PE, p2_PE, p3_PE)
# under the US title; it now uses the US panels built above.
grid.arrange(p1_US, p2_US, p3_US, ncol = 3,
             top = "Unemployment Status in the US")

# Comparison of Unemployment Rate between Peru and US
# Restrict both countries to the common 2001-2024 window
UE_Comparison <- UE_Countries %>% 
  filter(Country %in% c("Peru", "United States of America")) %>% 
  mutate(Month = as.Date(Month)) %>% 
  filter(Month >= as.Date("2001-01-01") & Month <= as.Date("2024-12-01"))

# One line per country, colored by Country
ggplot(UE_Comparison, aes(x = Month, y = Total.Per, color = Country)) +
  geom_line(size = 0.8) +  
  geom_point(size = 0.8, alpha = 0.7) + 
  theme_gray(base_size = 14) +  
  labs(title = "Unemployment Rate by Countries (2001-2024)",
    x = "Year", y = "Unemployment Rate (%)", color = "Country") +
  theme(plot.title = element_text(face = "bold", size = 16),
    axis.title.y = element_text(size = 14), axis.text = element_text(size = 12),
    legend.position = "right")

# Comparison of Unemployment Rate by Age Group 
# Peru panel: youth (15-24) vs. adult (25+) unemployment rates
PE1 <- ggplot(Peru, aes(x = Month)) +
  geom_line(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8) +
  geom_line(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8) +
  geom_point(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8, alpha = 0.5) +
  geom_point(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8, alpha = 0.5) +
  labs(title = "Unemployment Rate by Age Group",
    x = "Year", y = "Peru Unemployment Rate (%)", color = "Age Group") +
  scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red")) +
  theme_gray(base_size = 12) +  
  theme(plot.title = element_text(size = 14),
    axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
    axis.text = element_text(size = 12), legend.position = "bottom") 

# US panel: same series for the United States (title left empty on purpose
# so the shared title on the Peru panel covers both)
US1 <- ggplot(US, aes(x = Month)) +
  geom_line(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8) +
  geom_line(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8) +
  geom_point(aes(y = Age15to24.Per, color = "Age 15-24"), size = 0.8, alpha = 0.5) +
  geom_point(aes(y = Age25above.Per, color = "Age 25+"), size = 0.8, alpha = 0.5) +
  labs(title = "",
    x = "Year", y = "US Unemployment Rate (%)", color = "Age Group") +
  scale_color_manual(values = c("Age 15-24" = "blue", "Age 25+" = "red")) +
  theme_gray(base_size = 12) +  
  theme(plot.title = element_text(size = 14),
    axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
    axis.text = element_text(size = 12), legend.position = "bottom")

# Side-by-side comparison
grid.arrange(PE1, US1, ncol = 2)

# Comparison of Unemployment Rate by Sex Group 
# Peru panel: female vs. male unemployment rates
sPE1 <- ggplot(Peru, aes(x = Month)) +
  geom_line(aes(y = Female.Per, color = "Female"), size = 0.8) +
  geom_line(aes(y = Male.Per, color = "Male"), size = 0.8) +
  geom_point(aes(y = Female.Per, color = "Female"), size = 0.8, alpha = 0.5) +
  geom_point(aes(y = Male.Per, color = "Male"), size = 0.8, alpha = 0.5) +
  labs(title = "Unemployment Rate by Sex Group",
    x = "Year", y = "Peru Unemployment Rate (%)", color = "Sex Group") +
  scale_color_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_gray(base_size = 12) +  
  theme(plot.title = element_text(size = 14),
    axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
    axis.text = element_text(size = 12), legend.position = "bottom") 

# US panel: same series for the United States (empty title on purpose)
sUS1 <- ggplot(US, aes(x = Month)) +
  geom_line(aes(y = Female.Per, color = "Female"), size = 0.8) +
  geom_line(aes(y = Male.Per, color = "Male"), size = 0.8) +
  geom_point(aes(y = Female.Per, color = "Female"), size = 0.8, alpha = 0.5) +
  geom_point(aes(y = Male.Per, color = "Male"), size = 0.8, alpha = 0.5) +
  labs(title = "",
    x = "Year", y = "US Unemployment Rate (%)", color = "Sex Group") +
  scale_color_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_gray(base_size = 12) +  
  theme(plot.title = element_text(size = 14),
    axis.title.x = element_text(size = 12), axis.title.y = element_text(size = 12),
    axis.text = element_text(size = 12), legend.position = "bottom") 

# Side-by-side comparison
grid.arrange(sPE1, sUS1, ncol = 2)

Additional graphs (Jisup)

Initial plot for Peru from the perspective of the level of unemployment

# Import the Peru sheet from the consolidated project workbook
peru_unemployment <- read_excel(
  path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "Peru_unemployment", col_names = TRUE)

# Check for missing values and variable ranges
summary(peru_unemployment)
##    Country              Month                        Age15to24.Thou 
##  Length:281         Min.   :2001-04-01 00:00:00.00   Min.   : 39.3  
##  Class :character   1st Qu.:2007-02-01 00:00:00.00   1st Qu.:136.8  
##  Mode  :character   Median :2013-01-01 00:00:00.00   Median :154.0  
##                     Mean   :2012-12-16 01:42:29.47   Mean   :161.9  
##                     3rd Qu.:2018-11-01 00:00:00.00   3rd Qu.:177.1  
##                     Max.   :2024-09-01 00:00:00.00   Max.   :379.0  
##  Age25above.Thou AgeTotal.Thou    Age15to24.Per   Age25above.Per 
##  Min.   :119.7   Min.   : 191.0   Min.   : 4.90   Min.   :2.600  
##  1st Qu.:169.4   1st Qu.: 315.5   1st Qu.:12.10   1st Qu.:4.000  
##  Median :194.9   Median : 343.2   Median :13.50   Median :5.000  
##  Mean   :238.9   Mean   : 400.8   Mean   :13.16   Mean   :5.244  
##  3rd Qu.:219.0   3rd Qu.: 380.3   3rd Qu.:15.00   3rd Qu.:6.300  
##  Max.   :880.9   Max.   :1229.5   Max.   :18.80   Max.   :8.300  
##   AgeTotal.Per     Female.Thou      Male.Thou       Total.Thou    
##  Min.   : 3.400   Min.   : 65.9   Min.   : 89.9   Min.   : 191.0  
##  1st Qu.: 6.000   1st Qu.:166.0   1st Qu.:142.5   1st Qu.: 315.5  
##  Median : 7.000   Median :184.8   Median :162.0   Median : 343.2  
##  Mean   : 7.043   Mean   :214.6   Mean   :186.2   Mean   : 400.8  
##  3rd Qu.: 8.300   3rd Qu.:208.2   3rd Qu.:182.2   3rd Qu.: 380.3  
##  Max.   :10.800   Max.   :665.5   Max.   :586.0   Max.   :1229.5  
##    Female.Per        Male.Per        Total.Per       ...15        
##  Min.   : 4.000   Min.   : 2.800   Min.   : 3.400   Mode:logical  
##  1st Qu.: 6.800   1st Qu.: 4.900   1st Qu.: 6.000   NA's:281      
##  Median : 8.500   Median : 5.900   Median : 7.000                 
##  Mean   : 8.272   Mean   : 6.025   Mean   : 7.043                 
##  3rd Qu.: 9.900   3rd Qu.: 7.100   3rd Qu.: 8.300                 
##  Max.   :12.800   Max.   :10.000   Max.   :10.800                 
##  Employment_level  ...17         unemployment rate     ...19        
##  Min.   : 1944    Mode:logical   Min.   : 3.965    Min.   :-2.9389  
##  1st Qu.: 3060    NA's:281       1st Qu.: 7.249    1st Qu.:-2.2372  
##  Median : 3506                   Median : 8.654    Median :-1.7115  
##  Mean   : 4722                   Mean   : 8.779    Mean   :-1.7362  
##  3rd Qu.: 3894                   3rd Qu.:10.594    3rd Qu.:-1.3016  
##  Max.   :15557                   Max.   :13.739    Max.   :-0.5654
sum(is.na(peru_unemployment))
## [1] 562
# Check regularity of time index
diff_date <- diff(peru_unemployment$Month)
table(diff_date)  # The single 61-day gap means one month is skipped.
## diff_date
##  28  29  30  31  61 
##  17   6  93 163   1
peru_unemployment$Month <- as.Date(peru_unemployment$Month)  # date-time -> Date for joining


# Create full monthly sequence
full_month_seq <- data.frame(Month = seq.Date(from = min(peru_unemployment$Month),to = max(peru_unemployment$Month),
                                              by = "month"))

# Join with original to find missing months
missing_months <- full_month_seq %>%
  anti_join(peru_unemployment, by = "Month")

print(missing_months)  # Missing month is 2012-09-01
##        Month
## 1 2012-09-01
# Fill in missing months with NA (left join keeps the full sequence)
peru_unemployment <- full_month_seq %>%
  left_join(peru_unemployment, by = "Month")


# Check for outliers visually
# FIX: all three titles previously said "AgeTotal.Thou";
# each now names the series actually plotted.
boxplot(peru_unemployment$AgeTotal.Thou,
        main = "Boxplot: Peru AgeTotal.Thou",
        horizontal = TRUE,
        col = "lightblue")

boxplot(peru_unemployment$AgeTotal.Per,
        main = "Boxplot: Peru AgeTotal.Per",
        horizontal = TRUE,
        col = "lightblue")

boxplot(peru_unemployment$Employment_level,
        main = "Boxplot: Peru Employment_level",
        horizontal = TRUE,
        col = "lightblue")

# Convert to time series
# Monthly frequency; start = April 2001 matches min(Month) = 2001-04-01
# reported by summary() above.
ts_peru_total_thou <- ts(peru_unemployment$AgeTotal.Thou,
               start = c(2001, 4), 
               frequency = 12)
ts_peru_total_per <- ts(peru_unemployment$AgeTotal.Per,
               start = c(2001, 4), 
               frequency = 12)
ts_peru_employment_thou <- ts(peru_unemployment$Employment_level,
               start = c(2001, 4), 
               frequency = 12)



# interpolation of NA (fills the 2012-09 gap inserted above)
ts_peru_total_thou <- na_interpolation(ts_peru_total_thou, option = "linear")
ts_peru_total_per <- na_interpolation(ts_peru_total_per, option = "linear")
ts_peru_employment_thou <- na_interpolation(ts_peru_employment_thou, option = "linear")

# Verify the interpolation removed all NAs
sum(is.na(ts_peru_total_thou))
## [1] 0
sum(is.na(ts_peru_total_per))
## [1] 0
sum(is.na(ts_peru_employment_thou))
## [1] 0
# plotting the graph
peru_plot_data <- data.frame(
  Month = peru_unemployment$Month,
  Unemployment_Level = as.numeric(ts_peru_total_thou),
  Employment_Level = as.numeric(ts_peru_employment_thou),
  Unemployment_Rate_Percent = as.numeric(ts_peru_total_per)
)


# Dual-axis plot: the rate is scaled by 1000 onto the level axis,
# and sec_axis (~./1000) reverses that scaling for the right-hand axis.
p <- ggplot(peru_plot_data, aes(x = Month)) +
  geom_line(aes(y = Unemployment_Level, color = "Unemployment Level"), size = 1) +
  geom_line(aes(y = Employment_Level, color = "Employment Level"), size = 1) +
  geom_line(aes(y = Unemployment_Rate_Percent * 1000, color = "Unemployment Rate [%]"), size = 1) +
  scale_y_continuous(
    name = "Unemployment [Thousands]",
    limits = c(0, 20000),
    sec.axis = sec_axis(~./1000, name = "Unemployment Rate [%]", breaks = seq(0, 20, 5))
  ) +
  scale_color_manual(
    name = "Legend",
    values = c(
      "Unemployment Level" = "darkblue",
      "Employment Level" = "darkorange",
      "Unemployment Rate [%]" = "gray"
    )
  ) +
  labs(
    title = "Peru: Employment, Unemployment Count and Rate Over Time",
    x = "Month"
  ) +
  theme_minimal()

print(p)

# Decompose the time series (trend / seasonal / remainder)
decomp_ts_peru_total_thou <- decompose(ts_peru_total_thou)
plot(decomp_ts_peru_total_thou)

decomp_ts_peru_total_per <- decompose(ts_peru_total_per)
plot(decomp_ts_peru_total_per)

decomp_ts_peru_employment_thou <- decompose(ts_peru_employment_thou)
plot(decomp_ts_peru_employment_thou)

# ACF and PACF plots
# FIX: every panel was previously titled "AgeTotal.Thou";
# titles now name the series actually plotted.
par(mfrow = c(1, 2))
Acf(ts_peru_total_thou, lag.max = 40, main = "ACF: Peru AgeTotal.Thou")
Pacf(ts_peru_total_thou, lag.max = 40, main = "PACF: Peru AgeTotal.Thou")

par(mfrow = c(1, 2))
Acf(ts_peru_total_per, lag.max = 40, main = "ACF: Peru AgeTotal.Per")
Pacf(ts_peru_total_per, lag.max = 40, main = "PACF: Peru AgeTotal.Per")

par(mfrow = c(1, 2))
Acf(ts_peru_employment_thou, lag.max = 40, main = "ACF: Peru Employment_level")
Pacf(ts_peru_employment_thou, lag.max = 40, main = "PACF: Peru Employment_level")

# Grubbs test for outliers
# Each call tests whether the single most extreme observation is an outlier;
# a small p-value flags the series maximum.
grubbs.test(ts_peru_total_thou)
## 
##  Grubbs test for one outlier
## 
## data:  ts_peru_total_thou
## G = 4.25704, U = 0.93528, p-value = 0.002155
## alternative hypothesis: highest value 1229.5 is an outlier
grubbs.test(ts_peru_total_per)
## 
##  Grubbs test for one outlier
## 
## data:  ts_peru_total_per
## G = 2.40929, U = 0.97927, p-value = 1
## alternative hypothesis: highest value 10.8 is an outlier
grubbs.test(ts_peru_employment_thou)
## 
##  Grubbs test for one outlier
## 
## data:  ts_peru_employment_thou
## G = 2.91618, U = 0.96963, p-value = 0.4687
## alternative hypothesis: highest value 15556.571 is an outlier

Additional graphs (Jisup)

Initial plot for the US from the perspective of the level of unemployment

# Import the US sheet from the consolidated project workbook
us_unemployment <- read_excel(
  path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_unemployment", col_names = TRUE)

# Check for missing values and variable ranges
summary(us_unemployment)
##    Country              Month                         Age15to24.Thou
##  Length:925         Min.   :1948-01-01 00:00:00.000   Min.   : 434  
##  Class :character   1st Qu.:1967-04-01 00:00:00.000   1st Qu.:1497  
##  Mode  :character   Median :1986-07-01 00:00:00.000   Median :2321  
##                     Mean   :1986-07-01 19:21:20.431   Mean   :2298  
##                     3rd Qu.:2005-10-01 00:00:00.000   3rd Qu.:2970  
##                     Max.   :2025-01-01 00:00:00.000   Max.   :5005  
##  Age25above.Thou AgeTotal.Thou   Age15to24.Per   Age25above.Per  
##  Min.   :  960   Min.   : 1480   Min.   : 4.50   Min.   : 1.800  
##  1st Qu.: 2443   1st Qu.: 4127   1st Qu.: 9.60   1st Qu.: 3.300  
##  Median : 4207   Median : 6579   Median :11.30   Median : 4.100  
##  Mean   : 4304   Mean   : 6602   Mean   :11.59   Mean   : 4.414  
##  3rd Qu.: 5371   3rd Qu.: 8170   3rd Qu.:13.40   3rd Qu.: 5.200  
##  Max.   :17687   Max.   :22504   Max.   :26.90   Max.   :12.800  
##   AgeTotal.Per     Female.Thou      Male.Thou       Total.Thou   
##  Min.   : 2.400   Min.   :  502   Min.   :  824   Min.   : 1480  
##  1st Qu.: 4.400   1st Qu.: 1602   1st Qu.: 2470   1st Qu.: 4127  
##  Median : 5.500   Median : 3075   Median : 3562   Median : 6579  
##  Mean   : 5.684   Mean   : 2916   Mean   : 3686   Mean   : 6602  
##  3rd Qu.: 6.700   3rd Qu.: 3687   3rd Qu.: 4512   3rd Qu.: 8170  
##  Max.   :14.400   Max.   :11494   Max.   :11010   Max.   :22504  
##    Female.Per        Male.Per        Total.Per      Employment_level
##  Min.   : 2.600   Min.   : 1.900   Min.   : 2.400   Min.   : 46056  
##  1st Qu.: 4.800   1st Qu.: 4.200   1st Qu.: 4.400   1st Qu.: 59959  
##  Median : 5.700   Median : 5.200   Median : 5.500   Median : 89254  
##  Mean   : 5.934   Mean   : 5.565   Mean   : 5.684   Mean   : 90796  
##  3rd Qu.: 6.900   3rd Qu.: 6.600   3rd Qu.: 6.700   3rd Qu.:121455  
##  Max.   :15.700   Max.   :13.300   Max.   :14.400   Max.   :142843
sum(is.na(us_unemployment))
## [1] 0
# 1. Filtering after 2001
us_unemployment_filtered <- us_unemployment %>%
  filter(Month >= as.Date("2001-01-01"))

# 2. Plot
# Dual-axis plot: the rate is scaled by 7500 onto the level axis,
# and sec_axis (~./7500) reverses the scaling for the right-hand axis.
ggplot(us_unemployment_filtered, aes(x = Month)) +
  geom_line(aes(y = AgeTotal.Thou, color = "Unemployment Level"), size = 1) +
  geom_line(aes(y = Employment_level, color = "Employment Level"), size = 1) +
  geom_line(aes(y = AgeTotal.Per * 7500, color = "Unemployment Rate [%]"), size = 1) +
  scale_y_continuous(
    name = "Employment / Unemployment [Thousands]",
    limits = c(0, 150000), 
    sec.axis = sec_axis(~./7500, name = "Unemployment Rate [%]", breaks = seq(0, 20, 5))
  ) +
  scale_color_manual(
    name = "Legend",
    values = c(
      "Unemployment Level" = "darkblue",
      "Employment Level" = "darkorange",
      "Unemployment Rate [%]" = "gray"
    )
  ) +
  labs(
    title = "US: Employment, Unemployment Count and Rate Since 2001",
    x = "Month"
  ) +
  theme_minimal()

# Check regularity of time index
diff_date <- diff(us_unemployment$Month)
table(diff_date)  # All gaps are 28-31 days: no missing month in the US series.
## diff_date
##  28  29  30  31 
##  57  20 308 539
us_unemployment$Month <- as.Date(us_unemployment$Month)  # date-time -> Date for joining


# Create full monthly sequence
full_month_seq <- data.frame(Month = seq.Date(from = min(us_unemployment$Month),to = max(us_unemployment$Month),
                                              by = "month"))

# Join with original to find missing months
missing_months <- full_month_seq %>%
  anti_join(us_unemployment, by = "Month")

print(missing_months) # empty: confirms no missing months
## [1] Month
## <0 rows> (or 0-length row.names)
## [1] Month
## <0 rows> (or 0-length row.names)
# Check for outliers visually
# FIX: all three titles previously said "AgeTotal.Thou";
# each now names the series actually plotted.
boxplot(us_unemployment$AgeTotal.Thou,
        main = "Boxplot: US AgeTotal.Thou",
        horizontal = TRUE,
        col = "lightblue")

boxplot(us_unemployment$AgeTotal.Per,
        main = "Boxplot: US AgeTotal.Per",
        horizontal = TRUE,
        col = "lightblue")

boxplot(us_unemployment$Employment_level,
        main = "Boxplot: US Employment_level",
        horizontal = TRUE,
        col = "lightblue")

# Convert to time series
# BUG FIX: the start was copied from the Peru chunk as c(2001, 4), but the
# US sheet is unfiltered here and begins in January 1948 (min(Month) =
# 1948-01-01 per summary() above), so every observation was mis-dated and
# the monthly seasonal alignment was shifted. Start at 1948-01 instead.
ts_us_total_thou <- ts(us_unemployment$AgeTotal.Thou,
               start = c(1948, 1), 
               frequency = 12)
ts_us_total_per <- ts(us_unemployment$AgeTotal.Per,
               start = c(1948, 1), 
               frequency = 12)
ts_us_employment_thou <- ts(us_unemployment$Employment_level,
               start = c(1948, 1), 
               frequency = 12)

# Decompose the time series (trend / seasonal / remainder)
decomp_ts_us_total_thou <- decompose(ts_us_total_thou)
plot(decomp_ts_us_total_thou)

decomp_ts_us_total_per <- decompose(ts_us_total_per)
plot(decomp_ts_us_total_per)

decomp_ts_us_employment_thou <- decompose(ts_us_employment_thou)
plot(decomp_ts_us_employment_thou)

# ACF and PACF plots
# FIX: every panel was previously titled "us AgeTotal.Thou";
# titles now name the series actually plotted.
par(mfrow = c(1, 2))
Acf(ts_us_total_thou, lag.max = 40, main = "ACF: US AgeTotal.Thou")
Pacf(ts_us_total_thou, lag.max = 40, main = "PACF: US AgeTotal.Thou")

par(mfrow = c(1, 2))
Acf(ts_us_total_per, lag.max = 40, main = "ACF: US AgeTotal.Per")
Pacf(ts_us_total_per, lag.max = 40, main = "PACF: US AgeTotal.Per")

par(mfrow = c(1, 2))
Acf(ts_us_employment_thou, lag.max = 40, main = "ACF: US Employment_level")
Pacf(ts_us_employment_thou, lag.max = 40, main = "PACF: US Employment_level")

# Grubbs test for outliers
# Each call tests whether the single most extreme observation is an outlier;
# a small p-value flags the series maximum.
grubbs.test(ts_us_total_thou)
## 
##  Grubbs test for one outlier
## 
## data:  ts_us_total_thou
## G = 5.18747, U = 0.97085, p-value = 8.078e-05
## alternative hypothesis: highest value 22504.1 is an outlier
grubbs.test(ts_us_total_per)
## 
##  Grubbs test for one outlier
## 
## data:  ts_us_total_per
## G = 4.98129, U = 0.97312, p-value = 0.0002467
## alternative hypothesis: highest value 14.4 is an outlier
grubbs.test(ts_us_employment_thou)
## 
##  Grubbs test for one outlier
## 
## data:  ts_us_employment_thou
## G = 1.68866, U = 0.99691, p-value = 1
## alternative hypothesis: highest value 142843.355 is an outlier

Additional graphs (Jisup)

Comparing Peru and the US (population and GDP growth)

# Import GDP and population sheets from the consolidated workbook.
# NOTE(review): the hard-coded row offsets (244 and 80) trim leading rows of
# each sheet; per the summary() output below they leave the GDP series
# starting 2007-10 and the population series starting 2008-01 -- presumably
# to align both around 2008. Confirm against the workbook if it changes.
gdp_growth <- read_excel(
  path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_GDP", col_names = TRUE)
gdp_growth <- gdp_growth[244:nrow(gdp_growth), ]


pop_growth <- read_excel(
  path="../Data/Raw/Final Project_data_20250403_R.xlsx", sheet = "US_Population", col_names = TRUE)
pop_growth <- pop_growth[80:nrow(pop_growth), ]


# Check for missing values and variable ranges
summary(gdp_growth)
##  observation_date                
##  Min.   :2007-10-01 00:00:00.00  
##  1st Qu.:2012-01-01 00:00:00.00  
##  Median :2016-04-01 00:00:00.00  
##  Mean   :2016-03-31 21:54:46.96  
##  3rd Qu.:2020-07-01 00:00:00.00  
##  Max.   :2024-10-01 00:00:00.00  
##                                  
##  Real GDP\r\nBillions of Chained 2017 Dollars,\r\nSeasonally Adjusted Annual Rate
##  Min.   :16269                                                                   
##  1st Qu.:17367                                                                   
##  Median :19057                                                                   
##  Mean   :19302                                                                   
##  3rd Qu.:20843                                                                   
##  Max.   :23542                                                                   
##                                                                                  
##  US GDP growth     Real GDP per Capita GDP per Capita  Peru GDP growth  
##  Min.   :-28.100   Min.   :53017       Min.   :46865   Min.   :-29.200  
##  1st Qu.:  1.300   1st Qu.:55490       1st Qu.:51205   1st Qu.:  1.975  
##  Median :  2.500   Median :58704       Median :57705   Median :  3.550  
##  Mean   :  2.148   Mean   :59629       Mean   :60888   Mean   :  3.837  
##  3rd Qu.:  3.500   3rd Qu.:63022       3rd Qu.:66222   3rd Qu.:  5.950  
##  Max.   : 35.200   Max.   :69006       Max.   :87125   Max.   : 41.100  
##                                                        NA's   :1
sum(is.na(gdp_growth))  # single NA (in the Peru GDP growth column)
## [1] 1
summary(pop_growth)
##  observation_date                 US Population      US_pop_growth     
##  Min.   :2008-01-01 00:00:00.00   Min.   :30454300   Min.   :0.002004  
##  1st Qu.:2012-01-01 00:00:00.00   1st Qu.:31472500   1st Qu.:0.005917  
##  Median :2016-01-01 00:00:00.00   Median :32460900   Median :0.007781  
##  Mean   :2016-01-01 08:28:14.11   Mean   :32319418   Mean   :0.007094  
##  3rd Qu.:2020-01-01 00:00:00.00   3rd Qu.:33184000   3rd Qu.:0.008281  
##  Max.   :2024-01-01 00:00:00.00   Max.   :34021200   Max.   :0.009437  
##                                                                        
##  US Labor Participation rate(25+) US Employment-to-population ratio
##  Min.   :62.68                    Min.   :58.61                    
##  1st Qu.:63.73                    1st Qu.:60.95                    
##  Median :64.19                    Median :61.46                    
##  Mean   :64.62                    Mean   :61.37                    
##  3rd Qu.:65.37                    3rd Qu.:61.80                    
##  Max.   :67.37                    Max.   :64.27                    
##                                                                    
##  peru_pop_growth          ...7       ...8              ...9    
##  Min.   :-0.061086   Min.   : NA   Mode:logical   Min.   : NA  
##  1st Qu.:-0.003392   1st Qu.: NA   NA's:17        1st Qu.: NA  
##  Median : 0.003044   Median : NA                  Median : NA  
##  Mean   :-0.002813   Mean   :NaN                  Mean   :NaN  
##  3rd Qu.: 0.004503   3rd Qu.: NA                  3rd Qu.: NA  
##  Max.   : 0.028980   Max.   : NA                  Max.   : NA  
##                      NA's   :17                   NA's   :17
sum(is.na(pop_growth))  # NAs come from the empty ...7/...8/...9 spreadsheet columns (17 each)
## [1] 51
# 2. Plot
# Quarterly GDP growth for both countries on a shared axis
ggplot(gdp_growth, aes(x = observation_date)) +
  geom_line(aes(y = `Peru GDP growth`, color = "Peru GDP Growth"), size = 1) +
  geom_line(aes(y = `US GDP growth`, color = "US GDP Growth"), size = 1) +
  scale_color_manual(
    name = "Legend",
    values = c(
      "Peru GDP Growth" = "darkblue",
      "US GDP Growth" = "darkorange"
    )
  ) +
  labs(
    title = "US & Peru: Quarterly GDP Growth Since 2008",
    x = "Date",
    y = "GDP Growth (%)"
  ) +
  theme_minimal()

# Population growth for both countries.
# FIX: the data are annual observations (one per January 1, per the summary
# above) stored as proportions (~0.002-0.029), so the previous
# "Quarterly ... (%)" labels were wrong on both counts.
ggplot(pop_growth, aes(x = observation_date)) +
  geom_line(aes(y = peru_pop_growth, color = "Peru Population Growth"), size = 1) +
  geom_line(aes(y = US_pop_growth, color = "US Population Growth"), size = 1) +
  scale_color_manual(
    name = "Legend",
    values = c(
      "Peru Population Growth" = "darkblue",
      "US Population Growth" = "darkorange"
    )
  ) +
  labs(
    title = "US & Peru: Annual Population Growth Since 2008",
    x = "Date",
    y = "Population Growth (proportion)"
  ) +
  theme_minimal()

Missing month handling

## PERU
# Check regularity of time index
diff_date <- diff(Peru$Month)
table(diff_date)  # There's one missing month (the single 61-day gap).
## diff_date
##  28  29  30  31  61 
##  17   6  93 163   1
# Create full monthly sequence
full_month_seq <- data.frame(Month = seq.Date(from = min(Peru$Month),
                                              to = max(Peru$Month),
                                              by = "month"))

# Join with original to find missing months
missing_months <- full_month_seq %>%
  anti_join(Peru, by = "Month")

print(missing_months)  # Missing month is 2012-09-01
##        Month
## 1 2012-09-01
# Fill in missing months with NA and interpolate linearly
# (na.approx with na.rm = FALSE keeps any leading/trailing NAs as NA)
Peru <- full_month_seq %>%
  left_join(Peru, by = "Month") %>% 
  mutate(across(where(is.numeric), ~na.approx(.x, na.rm = FALSE)))

## US 
# Same regularity check for the US series; all gaps are 28-31 days
diff_date <- diff(US$Month)
table(diff_date)  # There's no missing month.
## diff_date
##  28  29  30  31 
##  18   6  96 167

Summary Statistics

## PERU
# Generate Summary Statistics (mean/SD/min/max/N per series), formatted as a
# gt table for interactive viewing.
summary_table <- Peru %>%
  select(-Month) %>%  # Exclude Month column
  summarise(across(where(is.numeric), 
                   list(Mean = ~ mean(.x),
                        SD = ~ sd(.x),
                        Min = ~ min(.x),
                        Max = ~ max(.x),
                        N = ~ sum(!is.na(.x))))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_") %>% 
  gt() %>%
  tab_header(title = "Summary Statistics of Unemployment Data in Peru",
    subtitle = "Monthly Data (2001-2024)") %>%
  fmt_number(columns = 2:6, decimals = 2) %>%
  cols_label(Variable = "Indicator", Mean = "Mean", SD = "Standard Deviation",
    Min = "Min", Max = "Max", N = "Observations") %>%
  tab_options(table.font.size = px(14),
    heading.title.font.size = px(18), heading.subtitle.font.size = px(14))

# print(summary_table)   # During knitting, this should not be executed.

# Recompute the same statistics as a plain tibble so knitr::kable can render
# them in the knitted document (gt output is suppressed above). This
# intentionally duplicates the pipeline; summary_table ends up holding the
# plain tibble.
summary_table <- Peru %>%
  select(-Month) %>%
  summarise(across(where(is.numeric), 
                   list(Mean = ~ mean(.x),
                        SD = ~ sd(.x),
                        Min = ~ min(.x),
                        Max = ~ max(.x),
                        N = ~ sum(!is.na(.x))))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_")

knitr::kable(summary_table, digits = 2, caption = "Summary Statistics of Unemployment Data in Peru")
Summary Statistics of Unemployment Data in Peru
Variable Mean SD Min Max N
Age15to24.Per 13.15 2.79 4.9 18.8 282
Age25above.Per 5.24 1.40 2.6 8.3 282
Female.Per 8.27 1.86 4.0 12.8 282
Male.Per 6.02 1.48 2.8 10.0 282
Total.Per 7.04 1.56 3.4 10.8 282
Age15to24.Thou 161.82 54.35 39.3 379.0 282
Age25above.Thou 238.57 150.11 119.7 880.9 282
Female.Thou 214.38 107.78 65.9 665.5 282
Male.Thou 186.00 89.20 89.9 586.0 282
Total.Thou 400.38 194.76 191.0 1229.5 282
# Check outliers 
# outliers::outlier() reports, per column, the value farthest from the mean.
# NOTE(review): Month is a Date and shows up as numeric days since 1970
# (19967) - consider excluding it from this call.
outlier(Peru) 
##           Month   Age15to24.Per  Age25above.Per      Female.Per        Male.Per 
##         19967.0             4.9             8.3            12.8            10.0 
##       Total.Per  Age15to24.Thou Age25above.Thou     Female.Thou       Male.Thou 
##            10.8           379.0           880.9           665.5           586.0 
##      Total.Thou 
##          1229.5
# Grubbs tests: each tests whether the single most extreme value in the
# series is an outlier (small p-value -> flagged).
grubbs.test(Peru$Age15to24.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Age15to24.Thou
## G = 3.99614, U = 0.94297, p-value = 0.007181
## alternative hypothesis: highest value 379 is an outlier
grubbs.test(Peru$Age25above.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Age25above.Thou
## G = 4.2792, U = 0.9346, p-value = 0.001938
## alternative hypothesis: highest value 880.9 is an outlier
grubbs.test(Peru$Age15to24.Per) 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Age15to24.Per
## G = 2.96292, U = 0.96865, p-value = 0.4013
## alternative hypothesis: lowest value 4.9 is an outlier
grubbs.test(Peru$Age25above.Per)
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Age25above.Per
## G = 2.17958, U = 0.98303, p-value = 1
## alternative hypothesis: highest value 8.3 is an outlier
grubbs.test(Peru$Female.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Female.Thou
## G = 4.18539, U = 0.93744, p-value = 0.003023
## alternative hypothesis: highest value 665.5 is an outlier
grubbs.test(Peru$Male.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Male.Thou
## G = 4.48425, U = 0.92818, p-value = 0.0007077
## alternative hypothesis: highest value 586 is an outlier
grubbs.test(Peru$Female.Per)
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Female.Per
## G = 2.43372, U = 0.97885, p-value = 1
## alternative hypothesis: highest value 12.8 is an outlier
grubbs.test(Peru$Male.Per)
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Male.Per
## G = 2.69350, U = 0.97409, p-value = 0.9522
## alternative hypothesis: highest value 10 is an outlier
grubbs.test(Peru$Total.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Total.Thou
## G = 4.25704, U = 0.93528, p-value = 0.002155
## alternative hypothesis: highest value 1229.5 is an outlier
grubbs.test(Peru$Total.Per)
## 
##  Grubbs test for one outlier
## 
## data:  Peru$Total.Per
## G = 2.40929, U = 0.97927, p-value = 1
## alternative hypothesis: highest value 10.8 is an outlier
# Check the box plot for total unemployment 
boxplot(Peru$Total.Per,
        main = "Boxplot: Peru Unemployment Rate (%)",
        horizontal = TRUE, 
        col = "lightblue")

# Find the row where Total.Per is the outlier (10.8%)
outlier_row <- Peru %>%
  filter(Total.Per == max(Total.Per, na.rm = TRUE))

print(outlier_row) # Highest Value is 2005-02-01. 
##        Month Age15to24.Per Age25above.Per Female.Per Male.Per Total.Per
## 1 2005-02-01          17.9            8.3       12.8      9.2      10.8
##   Age15to24.Thou Age25above.Thou Female.Thou Male.Thou Total.Thou
## 1          194.2           257.1       239.8     211.5      451.3
## US  (header previously said "PERU" - this chunk summarises the US data)
# Generate Summary Statistics (mean/SD/min/max/N per series), formatted as a
# gt table for interactive viewing.
summary_tableUS <- US %>%
  select(-Month) %>%  # Exclude Month column
  summarise(across(where(is.numeric), 
                   list(Mean = ~ mean(.x),
                        SD = ~ sd(.x),
                        Min = ~ min(.x),
                        Max = ~ max(.x),
                        N = ~ sum(!is.na(.x))))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_") %>% 
  gt() %>%
  tab_header(title = "Summary Statistics of Unemployment Data in US",
    subtitle = "Monthly Data (2001-2024)") %>%
  fmt_number(columns = 2:6, decimals = 2) %>%
  cols_label(Variable = "Indicator", Mean = "Mean", SD = "Standard Deviation",
    Min = "Min", Max = "Max", N = "Observations") %>%
  tab_options(table.font.size = px(14),
    heading.title.font.size = px(18), heading.subtitle.font.size = px(14))

# print(summary_tableUS) # During knitting, this should not be executed.

# Plain-tibble version of the same statistics for knitr::kable rendering.
summary_tableUS_simple <- US %>%
  select(-Month) %>%  
  summarise(across(where(is.numeric), 
                   list(Mean = ~ mean(.x),
                        SD = ~ sd(.x),
                        Min = ~ min(.x),
                        Max = ~ max(.x),
                        N = ~ sum(!is.na(.x))))) %>%
  pivot_longer(everything(), names_to = c("Variable", ".value"), names_sep = "_")

knitr::kable(summary_tableUS_simple, digits = 2, caption = "Summary Statistics of Unemployment Data in US")
Summary Statistics of Unemployment Data in US
Variable Mean SD Min Max N
Age15to24.Per 12.01 3.49 5.7 26.9 288
Age25above.Per 4.76 1.77 2.6 12.8 288
Female.Per 5.56 1.88 2.9 15.7 288
Male.Per 5.95 2.15 3.2 13.3 288
Total.Per 5.77 1.98 3.1 14.4 288
Age15to24.Thou 2584.03 739.48 1232.8 4869.8 288
Age25above.Thou 6368.66 2360.28 3711.0 17686.9 288
Female.Thou 4034.55 1344.29 2243.2 11494.3 288
Male.Thou 4918.15 1735.00 2842.0 11009.8 288
Total.Thou 8952.70 3016.38 5146.1 22504.1 288
# Check outliers 
# Per-column most extreme value. NOTE(review): Month is a Date, reported as
# numeric days since 1970 (20058) - consider excluding it.
outlier(US) 
##           Month   Age15to24.Per  Age25above.Per      Female.Per        Male.Per 
##         20058.0            26.9            12.8            15.7            13.3 
##       Total.Per  Age15to24.Thou Age25above.Thou     Female.Thou       Male.Thou 
##            14.4          4869.8         17686.9         11494.3         11009.8 
##      Total.Thou 
##         22504.1
# Grubbs tests on each US series (most extreme single value per series).
grubbs.test(US$Age15to24.Thou) 
## 
##  Grubbs test for one outlier
## 
## data:  US$Age15to24.Thou
## G = 3.09104, U = 0.96659, p-value = 0.2652
## alternative hypothesis: highest value 4869.8 is an outlier
grubbs.test(US$Age25above.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Age25above.Thou
## G = 4.7953, U = 0.9196, p-value = 0.0001438
## alternative hypothesis: highest value 17686.9 is an outlier
grubbs.test(US$Age15to24.Per) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Age15to24.Per
## G = 4.26881, U = 0.93628, p-value = 0.002095
## alternative hypothesis: highest value 26.9 is an outlier
grubbs.test(US$Age25above.Per) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Age25above.Per
## G = 4.52998, U = 0.92825, p-value = 0.0005784
## alternative hypothesis: highest value 12.8 is an outlier
grubbs.test(US$Female.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Female.Thou
## G = 5.54923, U = 0.89233, p-value = 1.695e-06
## alternative hypothesis: highest value 11494.3 is an outlier
grubbs.test(US$Male.Thou) 
## 
##  Grubbs test for one outlier
## 
## data:  US$Male.Thou
## G = 3.5110, U = 0.9569, p-value = 0.05618
## alternative hypothesis: highest value 11009.8 is an outlier
grubbs.test(US$Female.Per) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Female.Per
## G = 5.40402, U = 0.89789, p-value = 4.225e-06
## alternative hypothesis: highest value 15.7 is an outlier
grubbs.test(US$Male.Per)
## 
##  Grubbs test for one outlier
## 
## data:  US$Male.Per
## G = 3.42210, U = 0.95905, p-value = 0.07927
## alternative hypothesis: highest value 13.3 is an outlier
grubbs.test(US$Total.Thou) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Total.Thou
## G = 4.49260, U = 0.92943, p-value = 0.0006988
## alternative hypothesis: highest value 22504.1 is an outlier
grubbs.test(US$Total.Per) # This is an outlier. 
## 
##  Grubbs test for one outlier
## 
## data:  US$Total.Per
## G = 4.3545, U = 0.9337, p-value = 0.001386
## alternative hypothesis: highest value 14.4 is an outlier
# Check the box plot for total unemployment 
boxplot(US$Total.Per,
        main = "Boxplot: US Unemployment Rate (%)",
        horizontal = TRUE, 
        col = "lightblue")

# Find the row where Total.Per is the outlier (14.4%)
outlier_rowUS <- US %>%
  filter(Total.Per == max(Total.Per, na.rm = TRUE))

print(outlier_rowUS) # Outlier is 2020-04-01 (COVID-19 shock month). 
## # A tibble: 1 × 11
##   Month      Age15to24.Per Age25above.Per Female.Per Male.Per Total.Per
##   <date>             <dbl>          <dbl>      <dbl>    <dbl>     <dbl>
## 1 2020-04-01          26.9           12.8       15.7     13.3      14.4
## # ℹ 5 more variables: Age15to24.Thou <dbl>, Age25above.Thou <dbl>,
## #   Female.Thou <dbl>, Male.Thou <dbl>, Total.Thou <dbl>

Time Series Analysis for Peru

Step 1: Transform into time series and set training and testing windows for Peru

# Transform into time series
# Columns 2:11 hold the ten unemployment series (rates and thousands).
ts_Peru <- ts(Peru[,2:11],
              start=c(year(Peru$Month[1]), month(Peru$Month[1])),
              frequency = 12)

# Set the period (use `<-` for assignment, per R convention)
nobs <- nrow(Peru)   # total monthly observations
n_for <- 12          # hold out the final 12 months for testing

# Create a subset for training purpose 
ts_Peru_train <- ts(Peru[1:(nobs-n_for),2:11],
                    start=c(year(Peru$Month[1]), month(Peru$Month[1])),
                    frequency = 12)

# Create a subset for testing purpose (the last n_for months)
start_row <- nobs - n_for + 1
ts_Peru_test <- ts(Peru[(nobs-n_for+1):nobs,2:11],
                   start=c(year(Peru$Month[start_row]),
                           month(Peru$Month[start_row])), frequency = 12)

# Plots 
train <- autoplot(ts_Peru_train[,"Total.Per"]) + ylab("Unemployment Rate (%)") +
  ggtitle("Training Window")
test <- autoplot(ts_Peru_test[,"Total.Per"]) + ylab("Unemployment Rate (%)") +
  ggtitle("Testing Window")
grid.arrange(train, test, ncol = 2)

# ACF/PACF of the training series. `lag.max` is spelled out - the original
# `lag=` relied on partial argument matching against forecast::Acf's lag.max.
par(mfrow=c(1,2))
  Acf(ts_Peru_train[,"Total.Per"], lag.max=40, plot = TRUE, main = "")
  Pacf(ts_Peru_train[,"Total.Per"], lag.max=40, plot = TRUE, main = "")
par(mfrow=c(1,1))  # reset plot layout, mirroring the US section

Step 2: Decompose the time series for Peru

# Decompose the training series into trend/seasonal/remainder (classical,
# additive by default)
decom_totalper_train <- decompose(ts_Peru_train[,"Total.Per"])
plot(decom_totalper_train)

# Deseason: subtract the seasonal component
deseas_totalper_train <- seasadj(decom_totalper_train)  
plot(deseas_totalper_train)

# Run the tests on deseasoned series
print(adf.test(deseas_totalper_train, alternative = "stationary")) # It is unit root (p = 0.17, cannot reject). 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseas_totalper_train
## Dickey-Fuller = -2.9546, Lag order = 6, p-value = 0.1739
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_train)) # It has a decreasing trend.
## Score =  -24109 , Var(Score) = 2199023
## denominator =  36289.99
## tau = -0.664, 2-sided pvalue =< 2.22e-16
# Run the tests on original series 
print(adf.test(ts_Peru_train[,"Total.Per"], alternative = "stationary")) # It is stationary (p = 0.01). 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_Peru_train[, "Total.Per"]
## Dickey-Fuller = -4.0545, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_Peru_train[,"Total.Per"])) 
## Score =  -1992 , Var(Score) = 16095.33
## denominator =  2878.875
## tau = -0.692, 2-sided pvalue =< 2.22e-16
summary(smk.test(ts_Peru_train[,"Total.Per"])) # It has seasonality. 
## 
##  Seasonal Mann-Kendall trend test (Hirsch-Slack test)
## 
## data: ts_Peru_train[, "Total.Per"]
## alternative hypothesis: two.sided
## 
## Statistics for individual seasons
## 
## H0
##                       S   varS    tau      z   Pr(>|z|)    
## Season 1:   S = 0  -154 1249.3 -0.677 -4.329 1.5003e-05 ***
## Season 2:   S = 0  -146 1254.7 -0.636 -4.094 4.2475e-05 ***
## Season 3:   S = 0  -147 1251.7 -0.645 -4.127 3.6792e-05 ***
## Season 4:   S = 0  -171 1427.0 -0.684 -4.500 6.7873e-06 ***
## Season 5:   S = 0  -170 1430.7 -0.676 -4.468 7.8938e-06 ***
## Season 6:   S = 0  -167 1427.0 -0.668 -4.394 1.1110e-05 ***
## Season 7:   S = 0  -182 1430.7 -0.724 -4.785 1.7073e-06 ***
## Season 8:   S = 0  -189 1431.7 -0.750 -4.969 6.7427e-07 ***
## Season 9:   S = 0  -173 1429.7 -0.689 -4.549 5.3915e-06 ***
## Season 10:   S = 0 -158 1254.7 -0.688 -4.432 9.3205e-06 ***
## Season 11:   S = 0 -160 1254.7 -0.697 -4.489 7.1616e-06 ***
## Season 12:   S = 0 -175 1253.7 -0.764 -4.914 8.9118e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed (both series call for one difference)
print(ndiffs(ts_Peru_train[,"Total.Per"]))
## [1] 1
print(ndiffs(deseas_totalper_train))
## [1] 1

Step 3: Test Time Series Models for Peru

# Seasonal Naive Model 
# NOTE(review): despite the "_deseas_" in its name, this model is fit on the
# ORIGINAL (seasonal) training series - renaming would require touching later
# chunks, so the name is kept.
SNAIVE_deseas_totalper <- snaive(ts_Peru_train[,"Total.Per"], h=n_for)
autoplot(SNAIVE_deseas_totalper)

checkresiduals(SNAIVE_deseas_totalper)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 560.18, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24
# Simple Moving Average Model (smooth::sma picks the SMA order by IC search)
SMA_deseas_totalper <- smooth::sma(y = deseas_totalper_train, h=n_for, 
                                   holdout = FALSE, silent = FALSE) 
## Order 1 - 263.7536; Order 100 - 795.3006; Order 200 - 936.0019
## Order 1 - 263.7536; Order 50 - 686.2083; Order 100 - 795.3006
## Order 1 - 263.7536; Order 25 - 577.3812; Order 50 - 686.2083
## Order 1 - 263.7536; Order 13 - 477.0378; Order 25 - 577.3812
## Order 1 - 263.7536; Order 7 - 392.1558; Order 13 - 477.0378
## Order 1 - 263.7536; Order 4 - 350.4972; Order 7 - 392.1558
## Order 1 - 263.7536; Order 2 - 287.2054; Order 4 - 350.4972
## Order 12 - 465.9452

summary(SMA_deseas_totalper)
## 
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.1544
## All coefficients were provided
## Error standard deviation: 0.3936
## Sample size: 270
## Number of estimated parameters: 1
## Number of degrees of freedom: 269
## Information criteria:
##      AIC     AICc      BIC     BICc 
## 263.7387 263.7536 267.3371 267.3789
checkresiduals(SMA_deseas_totalper) # Residuals are not iid.

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 50.942, df = 24, p-value = 0.001073
## 
## Model df: 0.   Total lags used: 24
# Simple Exponential Smoothing Model on the deseasoned series
# NOTE(review): holdout/silent are not documented ses() arguments - they are
# passed through `...`; verify they have the intended effect.
SES_deseas_totalper = ses(y = deseas_totalper_train, h=n_for, 
                          holdout = FALSE, silent = FALSE)  
summary(SES_deseas_totalper)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = deseas_totalper_train, h = n_for, holdout = FALSE, silent = FALSE)
## 
##   Smoothing parameters:
##     alpha = 0.8211 
## 
##   Initial states:
##     l = 8.589 
## 
##   sigma:  0.3891
## 
##      AIC     AICc      BIC 
## 1005.810 1005.901 1016.606 
## 
## Error measures:
##                       ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.01630259 0.3876277 0.2960287 -0.4932815 4.548733 0.4949799
##                   ACF1
## Training set 0.0107995
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Oct 2023       4.974593 4.475978 5.473208 4.212027 5.737159
## Nov 2023       4.974593 4.329420 5.619767 3.987885 5.961301
## Dec 2023       4.974593 4.210472 5.738715 3.805971 6.143216
## Jan 2024       4.974593 4.107694 5.841492 3.648786 6.300401
## Feb 2024       4.974593 4.015872 5.933314 3.508356 6.440830
## Mar 2024       4.974593 3.932107 6.017080 3.380248 6.568939
## Apr 2024       4.974593 3.854589 6.094598 3.261694 6.687492
## May 2024       4.974593 3.782099 6.167087 3.150831 6.798356
## Jun 2024       4.974593 3.713770 6.235416 3.046331 6.902856
## Jul 2024       4.974593 3.648959 6.300228 2.947210 7.001976
## Aug 2024       4.974593 3.587172 6.362015 2.852715 7.096472
## Sep 2024       4.974593 3.528021 6.421165 2.762252 7.186934
autoplot(SES_deseas_totalper)

checkresiduals(SES_deseas_totalper) # Residuals are not iid. 

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 45.154, df = 24, p-value = 0.005586
## 
## Model df: 0.   Total lags used: 24
# SARIMA Model: auto.arima on the original (seasonal) series
SARIMA_totalper <- auto.arima(ts_Peru_train[,"Total.Per"])
print(SARIMA_totalper)
## Series: ts_Peru_train[, "Total.Per"] 
## ARIMA(3,0,4)(0,1,2)[12] with drift 
## 
## Coefficients:
##          ar1      ar2     ar3      ma1     ma2      ma3     ma4     sma1
##       2.2233  -1.8081  0.5245  -1.4075  0.7380  -0.2152  0.2613  -0.6534
## s.e.  0.1584   0.2932  0.1466   0.1642  0.1937   0.1047  0.0837   0.0745
##          sma2    drift
##       -0.1570  -0.0160
## s.e.   0.0754   0.0031
## 
## sigma^2 = 0.1511:  log likelihood = -125.09
## AIC=272.18   AICc=273.25   BIC=311.26
SARIMA_forecast_totalper <- forecast(object = SARIMA_totalper, h=n_for)
autoplot(SARIMA_forecast_totalper)

checkresiduals(SARIMA_forecast_totalper) # Residuals are iid.

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(3,0,4)(0,1,2)[12] with drift
## Q* = 16.411, df = 15, p-value = 0.3553
## 
## Model df: 9.   Total lags used: 24
# Deseasoned ARIMA Model: seasonal orders suppressed because the series was
# already seasonally adjusted
ARIMA_totalper <- auto.arima(deseas_totalper_train, max.D = 0, 
                             max.P = 0, max.Q = 0)
print(ARIMA_totalper)
## Series: deseas_totalper_train 
## ARIMA(4,1,0) 
## 
## Coefficients:
##           ar1      ar2      ar3      ar4
##       -0.1889  -0.0478  -0.2786  -0.1130
## s.e.   0.0608   0.0594   0.0595   0.0613
## 
## sigma^2 = 0.1411:  log likelihood = -116.43
## AIC=242.87   AICc=243.1   BIC=260.84
ARIMA_forecast_totalper <- forecast(object = ARIMA_totalper, h=n_for)
autoplot(ARIMA_forecast_totalper)

checkresiduals(ARIMA_forecast_totalper) # Residuals are iid.

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,1,0)
## Q* = 18.559, df = 20, p-value = 0.5506
## 
## Model df: 4.   Total lags used: 24
# STL + ETS Model (stlf: STL decomposition, ETS on the seasonally adjusted part)
ETS_totalper <-  stlf(ts_Peru_train[,"Total.Per"],h=n_for)
autoplot(ETS_totalper) 

checkresiduals(ETS_totalper) # Residuals are not iid. 

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 52.383, df = 24, p-value = 0.0006971
## 
## Model df: 0.   Total lags used: 24
# ARIMA + FOURIER Model: seasonality captured by K=3 Fourier pairs as
# external regressors; lambda=0 applies a log transform
ARIMA_Four_fit_totalper <- auto.arima(ts_Peru_train[,"Total.Per"], 
                             seasonal=FALSE, lambda=0,
                             xreg=fourier(ts_Peru_train[,"Total.Per"], 
                                          K=3))

ARIMA_Four_for_totalper <- forecast(ARIMA_Four_fit_totalper,
                           xreg=fourier(ts_Peru_train[,"Total.Per"],
                                        K=3, h=n_for),
                           h=n_for) 

autoplot(ARIMA_Four_for_totalper)

checkresiduals(ARIMA_Four_for_totalper) # Better fit

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,1,2) errors
## Q* = 18.424, df = 19, p-value = 0.4943
## 
## Model df: 5.   Total lags used: 24
# TBATS Model 
TBATS_fit_totalper <- tbats(ts_Peru_train[,"Total.Per"])
TBATS_for_totalper <- forecast(TBATS_fit_totalper, h = n_for)
autoplot(TBATS_for_totalper) 

checkresiduals(TBATS_fit_totalper) # Better fit

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 21.678, df = 24, p-value = 0.5985
## 
## Model df: 0.   Total lags used: 24
# Neural Network Model: NNAR with 3 lags, no seasonal lags (P=0); seasonality
# is supplied through the Fourier regressors instead
NN_fit_totalper <- nnetar(ts_Peru_train[,"Total.Per"],
                          p=3, P=0,
                          xreg=fourier(ts_Peru_train[,"Total.Per"], K=3))

NN_for_totalper <- forecast(NN_fit_totalper, 
                   h=n_for,
                   xreg=fourier(ts_Peru_train[,"Total.Per"], 
                                          K=3,h=n_for))

autoplot(NN_for_totalper)

checkresiduals(NN_fit_totalper) # Residuals are not iid.

## 
##  Ljung-Box test
## 
## data:  Residuals from NNAR(3,5)
## Q* = 45.116, df = 24, p-value = 0.005644
## 
## Model df: 0.   Total lags used: 24
## State Space Exponential Smoothing Model ("ZZZ" lets es() select all three
## ETS components automatically)
SSES_seas_totalper <- es(ts_Peru_train[,"Total.Per"],
                         model="ZZZ", h=n_for, holdout=FALSE)

plot(SSES_seas_totalper$fitted, type = "l", col = "blue", main = "Fitted Values from SSES Model")
lines(SSES_seas_totalper$forecast, col = "red")

checkresiduals(SSES_seas_totalper) # Residuals are not iid.

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 45.384, df = 24, p-value = 0.005245
## 
## Model df: 0.   Total lags used: 24
## State Space with BSM Model. The `fixed` vector pins three of the four
## variances (level/slope/seasonal) and estimates the remaining one (NA).
SS_seas_totalper <- StructTS(ts_Peru_train[,"Total.Per"],
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

SS_for_totalper <- forecast(SS_seas_totalper,h=n_for)

plot(SS_for_totalper)

checkresiduals(SS_seas_totalper) # Residuals are not iid. 

## 
##  Ljung-Box test
## 
## data:  Residuals from StructTS
## Q* = 238.03, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Step 4: Performance check for Peru

# Check accuracy of the models against the 12-month test window.
# (Fix: the seasonal-naive scores object was previously misspelled
# "SANIVE_tpscores"; corrected here and at its single use below.)
SNAIVE_tpscores <- accuracy(SNAIVE_deseas_totalper$mean,ts_Peru_test[,"Total.Per"])
SMA_tpscores <- accuracy(SMA_deseas_totalper$forecast,ts_Peru_test[,"Total.Per"])
SES_tpscores <- accuracy(SES_deseas_totalper$mean,ts_Peru_test[,"Total.Per"])
SARIMA_tpscores <- accuracy(SARIMA_forecast_totalper$mean,ts_Peru_test[,"Total.Per"])
ARIMA_tpscores <- accuracy(ARIMA_forecast_totalper$mean,ts_Peru_test[,"Total.Per"])
ETS_tpscores <- accuracy(ETS_totalper$mean,ts_Peru_test[,"Total.Per"])
ARIMA_Four_tpscores <- accuracy(ARIMA_Four_for_totalper$mean,ts_Peru_test[,"Total.Per"])
TBATS_tpscores <- accuracy(TBATS_for_totalper$mean,ts_Peru_test[,"Total.Per"])
NN_tpscores <- accuracy(NN_for_totalper$mean,ts_Peru_test[,"Total.Per"])
SSES_tpscores <- accuracy(SSES_seas_totalper$forecast,ts_Peru_test[,"Total.Per"])
SS_tpscores <- accuracy(SS_for_totalper$mean,ts_Peru_test[,"Total.Per"])

# Compare the matrix.
# NOTE(review): the row average mixes signed metrics (ME, MPE) with unsigned
# ones on different scales, so a negatively biased model can win on
# "Average". Kept as-is to preserve the reported ranking; consider RMSE or
# MAPE alone for model selection.
tpscores <- as.data.frame(rbind(SNAIVE_tpscores, SMA_tpscores, 
                                SES_tpscores, SARIMA_tpscores, ARIMA_tpscores, 
                                ETS_tpscores, ARIMA_Four_tpscores, TBATS_tpscores, 
                                NN_tpscores, SSES_tpscores, SS_tpscores)) %>%
  mutate(Average = rowMeans(., na.rm = TRUE))

row.names(tpscores) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
                       "ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
                       "SSES", "BSM")

# Choose model with lowest error (index row names directly rather than
# subsetting the data frame first)
best_model_index_tp <- which.min(tpscores[,"Average"])
cat("The best model by Average is:", row.names(tpscores)[best_model_index_tp])
## The best model by Average is: BSM
# Create Tables: kable of the accuracy matrix, 5 decimal places per column
kbl(tpscores, 
      caption = "Forecast Accuracy for Unemployment Rate (%) Data",
      digits = array(5,ncol(tpscores))) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  # highlight the model with the lowest Average (the previous comment said
  # RMSE, but the code stripes the lowest-Average row)
  kable_styling(latex_options="striped", stripe_index = which.min(tpscores[,"Average"]))
Forecast Accuracy for Unemployment Rate (%) Data
ME RMSE MAE MPE MAPE ACF1 Theil’s U Average
SNAIVE 0.64167 0.76974 0.69167 12.11799 13.16863 -0.01233 0.84418 4.03165
SMA 0.34691 0.78067 0.58421 5.00053 10.19900 0.22688 0.91989 2.57973
SES 0.31707 0.76788 0.57923 4.42730 10.15997 0.22688 0.90374 2.48315
SARIMA 0.52882 0.64457 0.52882 9.57292 9.57292 -0.27866 0.77381 3.04903
ARIMA 0.40318 0.80325 0.59973 6.09311 10.41354 0.20220 0.94873 2.78053
ETS 0.36756 0.52866 0.39184 6.52143 7.06096 -0.35963 0.64251 2.16476
ARIMA_FOURIER 0.52353 0.71506 0.56808 9.06052 10.05054 -0.28342 0.86529 3.07137
TBATS 0.44151 0.64266 0.50814 7.70751 9.18814 -0.39919 0.77704 2.69512
NNETAR 0.45053 0.59990 0.47009 8.08164 8.51621 -0.13512 0.71698 2.67146
SSES 0.31979 0.55243 0.42212 5.37303 7.63010 -0.41969 0.66813 2.07799
BSM -0.17471 0.42623 0.32943 -3.58676 6.36642 -0.07895 0.49635 0.53972
# Plot everything together: test-window actuals overlaid with each model's
# point forecast
autoplot(ts_Peru_test[,"Total.Per"]) +
  autolayer(SNAIVE_deseas_totalper, PI=FALSE, series="SNAIVE") + 
  autolayer(SES_deseas_totalper, PI=FALSE, series="SES") +
  autolayer(SARIMA_forecast_totalper, PI=FALSE, series="SARIMA") +
  autolayer(ARIMA_forecast_totalper, PI=FALSE, series="ARIMA") +
  autolayer(ETS_totalper, PI=FALSE, series="ETS") +
  autolayer(ARIMA_Four_for_totalper, PI=FALSE, series="ARIMA_FOURIER") +
  autolayer(TBATS_for_totalper, PI=FALSE, series="TBATS") +
  autolayer(NN_for_totalper, PI=FALSE, series="NNETAR") +
  autolayer(SS_for_totalper, PI=FALSE, series="BSM") +
  # SMA and SSES could not run: they are smooth-package objects, which
  # presumably lack autolayer methods - verify if re-adding them
  guides(colour=guide_legend(title="Forecast"))

Step 5: Forecast for 2025 with the best three models for Peru

# Forecast horizon for the full-sample models: 15 months past the last
# observation, so the forecast spans all of 2025. (The comment that was here
# previously said "State Space with BSM Model", but the first model fitted
# below is the state-space exponential smoothing model.)
n_full <- 15

# Create the time series retaining the full data set.
# Fix: select the total unemployment rate by NAME rather than positional
# index (the original used Peru[,6], which is Total.Per per the summary
# table) so the code survives column reordering.
ts_Peru_fulltrain <- ts(Peru[,"Total.Per"],
              start=c(year(Peru$Month[1]), month(Peru$Month[1])),
              frequency = 12)

# Fit SSES Model 
SSES_seas_totalper_fulltrain <- es(ts_Peru_fulltrain,
                         model="ZZZ", h=n_full, holdout=FALSE)

SSES_for_totalper_fulltrain <- forecast(SSES_seas_totalper_fulltrain, h=n_full)

# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(SSES_for_totalper_fulltrain, series="SSES",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in Peru") 

# Fit ETS Model (STL + ETS via stlf on the full sample)
# NOTE(review): stlf() already returns a forecast object, so the extra
# forecast() call below looks redundant - verify it is intentional.
ETS_seas_totalper_fulltrain <-  stlf(ts_Peru_fulltrain,h=n_full)
ETS_for_totalper_fulltrain <- forecast(ETS_seas_totalper_fulltrain, h=n_full)

# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(ETS_for_totalper_fulltrain, series="ETS",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in Peru")

# Fit SS with BSM Model (same fixed-variance setup as the training fit)
SS_seas_totalper_fulltrain <- StructTS(ts_Peru_fulltrain,
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

SS_for_totalper_fulltrain <- forecast(SS_seas_totalper_fulltrain,h=n_full)

# Plot model + observed data
autoplot(ts_Peru_fulltrain) +
  autolayer(SS_for_totalper_fulltrain, series="SS with BSM Model",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in Peru") 

# Plot 3 models together: the three best full-sample forecasts on one panel
autoplot(ts_Peru_fulltrain) +
  autolayer(SSES_for_totalper_fulltrain, series="SSES",PI=FALSE)+
  autolayer(ETS_for_totalper_fulltrain, series="ETS",PI=FALSE)+
  autolayer(SS_for_totalper_fulltrain, series="SS with BSM Model",PI=FALSE)+
  ylab("Unemployment Rate (%)") + 
  ggtitle("Forecasted Unemployment Rate (%) in Peru")

Step 6: The average of 3 forecasts

# --- 1. Extract predicted means ---
bsm_fc <- as.numeric(SS_for_totalper_fulltrain$mean)
sses_fc <- as.numeric(SSES_for_totalper_fulltrain$mean)
ets_fc <- as.numeric(ETS_for_totalper_fulltrain$mean)

# Compute average forecast (equal-weight ensemble of the three models)
avg_fc <- (bsm_fc + sses_fc + ets_fc) / 3

# --- 2. Create forecast date index ---
# Start one month after the last observed month in Peru.
# Fix: the original hard-coded "2025-01-01", but the full-sample forecasts
# begin the month immediately after the data end, so the fixed start date
# mislabelled the forecast index by several months.
start_month <- seq(from = max(Peru$Month), by = "month", length.out = 2)[2]
forecast_dates <- seq(from = start_month, by = "month", length.out = n_full)

# --- 3. Create forecast dataframe ---
forecast_df <- tibble(
  Month = forecast_dates,
  BSM = bsm_fc,
  SSES = sses_fc,
  ETS = ets_fc,
  Avg_Forecast = avg_fc
)

# --- 4. Export to Excel ---
write.xlsx(forecast_df, "../Forecast_Average_Peru.xlsx")

# --- 5. Extract actuals from 2020 ---
peru_actual_2020on <- Peru %>%
  filter(Month >= as.Date("2020-01-01")) %>%
  select(Month, Actual = Total.Per)

# Build the observed and forecast halves separately, then stack them into a
# single long data frame for plotting
actual_part <- tibble(Month = peru_actual_2020on$Month,
                      Value = peru_actual_2020on$Actual,
                      Type = "Actual")
forecast_part <- tibble(Month = forecast_dates,
                        Value = avg_fc,
                        Type = "Avg Forecast")
plot_df <- bind_rows(actual_part, forecast_part)

# --- 6. Plot actuals + forecast ---
# group = 1 keeps the line continuous across the actual/forecast boundary
ggplot(plot_df, aes(x = Month, y = Value, color = Type, group = 1)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("Actual" = "black", "Avg Forecast" = "steelblue")) +
  labs(
    title = "Peru Unemployment Rate: Actual (2020~) + Avg Forecast (2025)",
    y = "Unemployment Rate (%)",
    x = "Month",
    color = "Legend"
  ) +
  theme_minimal()

# --- 7. Extract actuals from 2022 ---
peru_actual_2022on <- Peru %>%
  filter(Month >= as.Date("2022-01-01")) %>%
  select(Month, Actual = Total.Per)

# Stack the shorter actuals window with the averaged forecast (same layout
# as the 2020 plot above)
actual_part <- tibble(Month = peru_actual_2022on$Month,
                      Value = peru_actual_2022on$Actual,
                      Type = "Actual")
forecast_part <- tibble(Month = forecast_dates,
                        Value = avg_fc,
                        Type = "Avg Forecast")
plot_df <- bind_rows(actual_part, forecast_part)

# --- 8. Plot actuals + forecast ---
ggplot(plot_df, aes(x = Month, y = Value, color = Type, group = 1)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("Actual" = "black", "Avg Forecast" = "steelblue")) +
  labs(
    title = "Peru Unemployment Rate: Actual (2022~) + Avg Forecast (2025)",
    y = "Unemployment Rate (%)",
    x = "Month",
    color = "Legend"
  ) +
  theme_minimal()

Time Series Analysis for US (Original Series)

Step 1: Transform into time series and set training and testing windows for US (Original)

# Transform into time series
# Columns 2:11 hold the ten US unemployment series (rates and thousands).
ts_US <- ts(US[,2:11],
              start=c(year(US$Month[1]), month(US$Month[1])),
              frequency = 12)

# Set the period (use `<-` for assignment, per R convention)
nobsUS <- nrow(US)   # total monthly observations
n_forUS <- 12        # hold out the final 12 months for testing

# Create a subset for training purpose 
ts_US_train <- ts(US[1:(nobsUS-n_forUS),2:11],
                    start=c(year(US$Month[1]), month(US$Month[1])),
                    frequency = 12)

# Create a subset for testing purpose (the last n_forUS months)
start_rowUS <- nobsUS - n_forUS + 1
ts_US_test <- ts(US[(nobsUS - n_forUS + 1):nobsUS,2:11],
                   start=c(year(US$Month[start_rowUS]),
                           month(US$Month[start_rowUS])), frequency = 12)

# Plots 
trainUS <- autoplot(ts_US_train[,"Total.Per"]) + ylab("Unemployment Rate (%)") +
  ggtitle("Training Window")
testUS <- autoplot(ts_US_test[,"Total.Per"]) + ylab("Unemployment Rate (%)") +
  ggtitle("Testing Window")
grid.arrange(trainUS, testUS, ncol = 2)

# ACF/PACF of the training series. `lag.max` is spelled out - the original
# `lag=` relied on partial argument matching against forecast::Acf's lag.max.
par(mfrow=c(1,2))
Acf(ts_US_train[,"Total.Per"], lag.max=40, plot = TRUE, main = "")
Pacf(ts_US_train[,"Total.Per"], lag.max=40, plot = TRUE, main = "")

par(mfrow=c(1,1))

Step 2: Decompose the time series for US (Original)

# Decompose the US training series (classical decomposition)
decom_totalper_trainUS <- decompose(ts_US_train[,"Total.Per"])
plot(decom_totalper_trainUS)

# Deseason: subtract the seasonal component
deseas_totalper_trainUS <- seasadj(decom_totalper_trainUS)  
plot(deseas_totalper_trainUS)

# Run the tests on deseasoned series
print(adf.test(deseas_totalper_trainUS, alternative = "stationary")) # Unit root cannot be rejected (p = 0.43) -> NOT stationary (the original comment said "stationary", contradicting the p-value below). 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseas_totalper_trainUS
## Dickey-Fuller = -2.3519, Lag order = 6, p-value = 0.4278
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_trainUS)) # It has a decreasing trend.
## Score =  -10342 , Var(Score) = 2348637
## denominator =  37927.99
## tau = -0.273, 2-sided pvalue =1.5023e-11
# Run the tests on original series 
print(adf.test(ts_US_train[,"Total.Per"], alternative = "stationary")) # Unit root cannot be rejected (p = 0.43) -> NOT stationary here either. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_US_train[, "Total.Per"]
## Dickey-Fuller = -2.3553, Lag order = 6, p-value = 0.4264
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_US_train[,"Total.Per"])) 
## Score =  -876 , Var(Score) = 17157.33
## denominator =  3013.905
## tau = -0.291, 2-sided pvalue =2.2665e-11
summary(smk.test(ts_US_train[,"Total.Per"])) # It has seasonality. 
## 
##  Seasonal Mann-Kendall trend test (Hirsch-Slack test)
## 
## data: ts_US_train[, "Total.Per"]
## alternative hypothesis: two.sided
## 
## Statistics for individual seasons
## 
## H0
##                       S   varS    tau      z  Pr(>|z|)   
## Season 1:   S = 0   -77 1429.7 -0.307 -2.010 0.0444311  *
## Season 2:   S = 0   -81 1429.7 -0.323 -2.116 0.0343627  *
## Season 3:   S = 0   -81 1429.0 -0.323 -2.116 0.0343207  *
## Season 4:   S = 0   -55 1431.7 -0.218 -1.427 0.1535337   
## Season 5:   S = 0   -51 1427.0 -0.204 -1.324 0.1856346   
## Season 6:   S = 0   -59 1429.0 -0.235 -1.534 0.1249545   
## Season 7:   S = 0   -58 1430.7 -0.231 -1.507 0.1318174   
## Season 8:   S = 0   -61 1429.7 -0.243 -1.587 0.1125483   
## Season 9:   S = 0   -72 1430.7 -0.286 -1.877 0.0605034  .
## Season 10:   S = 0  -81 1433.7 -0.320 -2.113 0.0346148  *
## Season 11:   S = 0  -97 1429.7 -0.386 -2.539 0.0111186  *
## Season 12:   S = 0 -103 1427.0 -0.412 -2.700 0.0069308 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed 
print(ndiffs(ts_US_train[,"Total.Per"]))
## [1] 1
print(ndiffs(deseas_totalper_trainUS))
## [1] 1

Step 3: Test Time Series Models for US (Original)

# Seasonal Naive Model (benchmark: repeats the same month of the previous year)
SNAIVE_deseas_totalperUS <- snaive(ts_US_train[,"Total.Per"], h=n_forUS)
autoplot(SNAIVE_deseas_totalperUS)

checkresiduals(SNAIVE_deseas_totalperUS) # Residuals are not iid (Ljung-Box p < 2.2e-16). 

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 691.48, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24
# Simple Moving Average Model
# sma() searches the averaging order; lines below report the criterion per order.
SMA_deseas_totalperUS <- smooth::sma(y = deseas_totalper_trainUS, h=n_forUS, 
                                     holdout = FALSE, silent = FALSE) 
## Order 1 - 564.0677; Order 100 - 1176.5245; Order 200 - 1165.6369
## Order 1 - 564.0677; Order 50 - 1116.0115; Order 100 - 1176.5245
## Order 1 - 564.0677; Order 25 - 1011.42; Order 50 - 1116.0115
## Order 1 - 564.0677; Order 13 - 890.01; Order 25 - 1011.42
## Order 1 - 564.0677; Order 7 - 797.6525; Order 13 - 890.01
## Order 1 - 564.0677; Order 4 - 721.5871; Order 7 - 797.6525
## Order 1 - 564.0677; Order 2 - 631.7024; Order 4 - 721.5871
## Order 12 - 876.6067

summary(SMA_deseas_totalperUS)
## 
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.4487
## All coefficients were provided
## Error standard deviation: 0.6711
## Sample size: 276
## Number of estimated parameters: 1
## Number of degrees of freedom: 275
## Information criteria:
##      AIC     AICc      BIC     BICc 
## 564.0531 564.0677 567.6735 567.7145
checkresiduals(SMA_deseas_totalperUS) # Residuals are iid (Ljung-Box p = 0.9925). 

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 10.428, df = 24, p-value = 0.9925
## 
## Model df: 0.   Total lags used: 24
# Simple Exponential Smoothing Model (fit on the deseasoned training series)
SES_deseas_totalperUS = ses( y = deseas_totalper_trainUS, h=n_forUS, 
                             holdout = FALSE, silent = FALSE)  
summary(SES_deseas_totalperUS)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = deseas_totalper_trainUS, h = n_forUS, holdout = FALSE, 
##     silent = FALSE)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 4.3496 
## 
##   sigma:  0.6723
## 
##      AIC     AICc      BIC 
## 1336.031 1336.119 1346.892 
## 
## Error measures:
##                        ME      RMSE      MAE        MPE     MAPE      MASE
## Training set -0.001830183 0.6698351 0.228668 -0.3309908 3.598532 0.2056142
##                    ACF1
## Training set 0.02778021
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Jan 2024       3.844512 2.9829569 4.706068  2.5268770 5.162148
## Feb 2024       3.844512 2.6261500 5.062875  1.9811878 5.707837
## Mar 2024       3.844512 2.3523542 5.336671  1.5624533 6.126572
## Apr 2024       3.844512 2.1215308 5.567494  1.2094395 6.479585
## May 2024       3.844512 1.9181701 5.770855  0.8984261 6.790599
## Jun 2024       3.844512 1.7343172 5.954708  0.6172473 7.071778
## Jul 2024       3.844512 1.5652465 6.123778  0.3586760 7.330349
## Aug 2024       3.844512 1.4078790 6.281146  0.1180032 7.571022
## Sep 2024       3.844512 1.2600760 6.428949 -0.1080420 7.797067
## Oct 2024       3.844512 1.1202803 6.568745 -0.3218411 8.010866
## Nov 2024       3.844512 0.9873162 6.701709 -0.5251921 8.214217
## Dec 2024       3.844512 0.8602706 6.828754 -0.7194916 8.408516
autoplot(SES_deseas_totalperUS)

checkresiduals(SES_deseas_totalperUS) # Residuals are iid (Ljung-Box p = 0.9925). 

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 10.43, df = 24, p-value = 0.9925
## 
## Model df: 0.   Total lags used: 24
# SARIMA Model (auto.arima on the original, still-seasonal series)
SARIMA_totalperUS <- auto.arima(ts_US_train[,"Total.Per"])
print(SARIMA_totalperUS)
## Series: ts_US_train[, "Total.Per"] 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.5215:  log likelihood = -300.68
## AIC=603.35   AICc=603.37   BIC=606.97
SARIMA_forecast_totalperUS <- forecast(object = SARIMA_totalperUS, h=n_forUS)
autoplot(SARIMA_forecast_totalperUS)

checkresiduals(SARIMA_forecast_totalperUS) # Residuals are iid (Ljung-Box p = 0.5354).

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 22.736, df = 24, p-value = 0.5354
## 
## Model df: 0.   Total lags used: 24
# Deseasoned ARIMA Model (fit on the seasonally adjusted training series;
# seasonal terms suppressed via max.P/max.Q/max.D = 0)
ARIMA_totalperUS <- auto.arima(deseas_totalper_trainUS, max.D = 0, 
                               max.P = 0, max.Q = 0)
print(ARIMA_totalperUS)
## Series: deseas_totalper_trainUS 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.4503:  log likelihood = -280.51
## AIC=563.01   AICc=563.03   BIC=566.63
ARIMA_forecast_totalperUS <- forecast(object = ARIMA_totalperUS, h=n_forUS)
autoplot(ARIMA_forecast_totalperUS)

checkresiduals(ARIMA_forecast_totalperUS) # Residuals are iid (Ljung-Box p = 0.9925).

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 10.428, df = 24, p-value = 0.9925
## 
## Model df: 0.   Total lags used: 24
# STL + ETS Model (stlf: STL decomposition, then ETS on the seasonally adjusted part)
ETS_totalperUS <-  stlf(ts_US_train[,"Total.Per"],h=n_forUS)
autoplot(ETS_totalperUS) 

checkresiduals(ETS_totalperUS) # Residuals are iid (Ljung-Box p = 0.1086). 

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 32.789, df = 24, p-value = 0.1086
## 
## Model df: 0.   Total lags used: 24
# ARIMA + FOURIER Model: seasonality captured by K=3 Fourier regressors
ARIMA_Four_fit_totalperUS <- auto.arima(ts_US_train[,"Total.Per"], 
                             seasonal=FALSE, lambda=0,
                             xreg=fourier(ts_US_train[,"Total.Per"], 
                                          K=3))

ARIMA_Four_for_totalperUS <- forecast(ARIMA_Four_fit_totalperUS,
                           xreg=fourier(ts_US_train[,"Total.Per"],
                                        K=3, h=n_forUS),
                           h=n_forUS) 

autoplot(ARIMA_Four_for_totalperUS)

checkresiduals(ARIMA_Four_for_totalperUS) # Residuals are iid (Ljung-Box p = 0.7757). 

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(0,1,0) errors
## Q* = 18.547, df = 24, p-value = 0.7757
## 
## Model df: 0.   Total lags used: 24
# TBATS Model 
TBATS_fit_totalperUS <- tbats(ts_US_train[,"Total.Per"])
TBATS_for_totalperUS <- forecast(TBATS_fit_totalperUS, h = n_forUS)
autoplot(TBATS_for_totalperUS) 

checkresiduals(TBATS_fit_totalperUS) # Residuals are iid (Ljung-Box p = 0.96). 

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 13.35, df = 24, p-value = 0.96
## 
## Model df: 0.   Total lags used: 24
# Neural Network Model (NNAR with 3 lags, seasonality via K=3 Fourier regressors)
NN_fit_totalperUS <- nnetar(ts_US_train[,"Total.Per"],
                 p=3, P=0,
                 xreg=fourier(ts_US_train[,"Total.Per"], K=3))

NN_for_totalperUS <- forecast(NN_fit_totalperUS, 
                   h=n_forUS,
                   xreg=fourier(ts_US_train[,"Total.Per"], 
                                          K=3,h=n_forUS))

autoplot(NN_for_totalperUS)

checkresiduals(NN_fit_totalperUS) # Residuals are iid (Ljung-Box p = 0.9867). 

## 
##  Ljung-Box test
## 
## data:  Residuals from NNAR(3,5)
## Q* = 11.304, df = 24, p-value = 0.9867
## 
## Model df: 0.   Total lags used: 24
## State Space Exponential Smoothing Model (model="ZZZ" lets es() pick components)
SSES_seas_totalperUS <- es(ts_US_train[,"Total.Per"],
                         model="ZZZ", h=n_forUS, holdout=FALSE)
checkresiduals(SSES_seas_totalperUS) # Residuals are iid (Ljung-Box p = 0.2611).

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 27.974, df = 24, p-value = 0.2611
## 
## Model df: 0.   Total lags used: 24
## State Space with BSM Model
# NOTE(review): the first three entries of `fixed` appear to pin the level,
# slope and seasonal variances, with NA leaving one variance free -- confirm
# the ordering against the StructTS documentation.
SS_seas_totalperUS <- StructTS(ts_US_train[,"Total.Per"],
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

SS_for_totalperUS <- forecast(SS_seas_totalperUS,h=n_forUS)

plot(SS_for_totalperUS)

checkresiduals(SS_seas_totalperUS) # Residuals are not iid (Ljung-Box p < 2.2e-16). 

## 
##  Ljung-Box test
## 
## data:  Residuals from StructTS
## Q* = 253.52, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Step 4: Performance check for US (Original)

# Score each model's 12-month forecast against the test window.
# accuracy() returns ME, RMSE, MAE, MPE, MAPE, ACF1 and Theil's U.
# (Renamed the misspelled `SANIVE_` object to `SNAIVE_`.)
SNAIVE_tpscoresUS <- accuracy(SNAIVE_deseas_totalperUS$mean, ts_US_test[,"Total.Per"])
SMA_tpscoresUS <- accuracy(SMA_deseas_totalperUS$forecast, ts_US_test[,"Total.Per"])
SES_tpscoresUS <- accuracy(SES_deseas_totalperUS$mean, ts_US_test[,"Total.Per"])
SARIMA_tpscoresUS <- accuracy(SARIMA_forecast_totalperUS$mean, ts_US_test[,"Total.Per"])
ARIMA_tpscoresUS <- accuracy(ARIMA_forecast_totalperUS$mean, ts_US_test[,"Total.Per"])
ETS_tpscoresUS <- accuracy(ETS_totalperUS$mean, ts_US_test[,"Total.Per"])
ARIMA_Four_tpscoresUS <- accuracy(ARIMA_Four_for_totalperUS$mean, ts_US_test[,"Total.Per"])
TBATS_tpscoresUS <- accuracy(TBATS_for_totalperUS$mean, ts_US_test[,"Total.Per"])
NN_tpscoresUS <- accuracy(NN_for_totalperUS$mean, ts_US_test[,"Total.Per"])
SSES_tpscoresUS <- accuracy(SSES_seas_totalperUS$forecast, ts_US_test[,"Total.Per"])
SS_tpscoresUS <- accuracy(SS_for_totalperUS$mean, ts_US_test[,"Total.Per"])

# Stack the one-row score matrices and add a row-wise average across all
# metrics (note: this mixes signed and unsigned metrics).
tpscoresUS <- as.data.frame(rbind(SNAIVE_tpscoresUS, SMA_tpscoresUS,
                                  SES_tpscoresUS, SARIMA_tpscoresUS, ARIMA_tpscoresUS,
                                  ETS_tpscoresUS, ARIMA_Four_tpscoresUS, TBATS_tpscoresUS,
                                  NN_tpscoresUS, SSES_tpscoresUS, SS_tpscoresUS)) %>%
  mutate(Average = rowMeans(., na.rm = TRUE))

row.names(tpscoresUS) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
                           "ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
                           "SSES", "BSM")

# Choose the model with the lowest average error
best_model_index_tpUS <- which.min(tpscoresUS[,"Average"])
cat("The best model by Average is:", row.names(tpscoresUS)[best_model_index_tpUS])

# Create the accuracy table.
# Highlight the row of the best (lowest-Average) model.
# BUG FIX: stripe_index previously used `seas_scores`, an object from a
# different section; it must use this table's own best-model index.
kbl(tpscoresUS, 
      caption = "Forecast Accuracy for Unemployment Rate (%) Data",
      digits = array(5,ncol(tpscoresUS))) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  kable_styling(latex_options="striped", stripe_index = best_model_index_tpUS)
Forecast Accuracy for Unemployment Rate (%) Data
ME RMSE MAE MPE MAPE ACF1 Theil’s U Average
SNAIVE 0.38333 0.40620 0.38333 9.46470 9.46470 0.28333 1.35284 3.10549
SMA 0.17216 0.33070 0.26108 3.80965 6.29631 0.44396 1.06312 1.76814
SES 0.17215 0.33070 0.26108 3.80953 6.29625 0.44396 1.06311 1.76811
SARIMA 0.51667 0.58878 0.51667 12.42930 12.42930 0.44396 1.85999 4.11210
ARIMA 0.17216 0.33070 0.26108 3.80965 6.29631 0.44396 1.06312 1.76814
ETS -0.09918 0.54307 0.39400 -2.91283 10.30378 0.55949 1.95874 1.53529
ARIMA_FOURIER 0.40391 0.45420 0.40391 9.80344 9.80344 0.50899 1.46803 3.26370
TBATS 0.32666 0.38544 0.34616 7.88226 8.43937 0.53705 1.26470 2.74024
NNETAR -5.23924 6.77898 5.26090 -128.73043 129.24610 0.83212 22.16527 4.33053
SSES 1.52521 1.71249 1.52948 38.05395 38.15815 0.73123 5.77401 12.49779
BSM -0.06016 0.32935 0.22841 -1.82484 5.93544 0.57974 1.18115 0.90987
# Plot the test window together with each model's point forecast.
# SMA and SSES return smooth-package objects that autolayer() cannot draw,
# so they are omitted from this chart.
autoplot(ts_US_test[,"Total.Per"]) +
  autolayer(SNAIVE_deseas_totalperUS, PI=FALSE, series="SNAIVE") + 
  autolayer(SES_deseas_totalperUS, PI=FALSE, series="SES") +
  autolayer(SARIMA_forecast_totalperUS, PI=FALSE, series="SARIMA") +
  autolayer(ARIMA_forecast_totalperUS, PI=FALSE, series="ARIMA") +
  autolayer(ETS_totalperUS, PI=FALSE, series="ETS") +
  autolayer(ARIMA_Four_for_totalperUS, PI=FALSE, series="ARIMA_FOURIER") +
  autolayer(TBATS_for_totalperUS, PI=FALSE, series="TBATS") +
  autolayer(NN_for_totalperUS, PI=FALSE, size=0.7, series="NNETAR") +
  autolayer(SS_for_totalperUS, PI=FALSE, series="BSM") +
  guides(colour=guide_legend(title="Forecast")) # SMA and SSES could not run

Step 5: Forecast for 2025 with the best three models for US (Original)

Note (Jisup): The SARIMA model cannot capture the COVID-19 shock in this series, which is why auto.arima() selects a non-seasonal ARIMA(0,1,0) as the "SARIMA" fit. Several alternative seasonal specifications were tried, but none improved on it.

# Set the forecasting period (12 months ahead, i.e. calendar year 2025)
n_fullUS = 12

# Create the time series to retain full data set
# NOTE(review): column 6 is assumed to be Total.Per -- confirm against the
# column layout of US before relying on this.
ts_US_fulltrain <- ts(US[,6],
              start=c(year(US$Month[1]), month(US$Month[1])),
              frequency = 12)

# Fit SS with BSM Model (same fixed variances as the training fit)
SS_seas_totalper_fulltrainUS <- StructTS(ts_US_fulltrain,
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

SS_for_totalper_fulltrainUS <- forecast(SS_seas_totalper_fulltrainUS,h=n_fullUS)

# Plot model + observed data
autoplot(ts_US_fulltrain) +
  autolayer(SS_for_totalper_fulltrainUS, series="SS with BSM Model",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US") 

# Fit STL + ETS Model
ETS_totalper_fulltrainUS <-  stlf(ts_US_fulltrain,h=n_fullUS)

# Plot model + observed data
autoplot(ts_US_fulltrain) +
  autolayer(ETS_totalper_fulltrainUS, series="ETS",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US")

# Simple Exponential Smoothing Model (refit on the full, deseasoned series)
decom_totalper_fulltrainUS <- decompose(ts_US_fulltrain)
deseas_totalper_fulltrainUS <- seasadj(decom_totalper_fulltrainUS)
SES_deseas_totalper_fulltrainUS = ses( y = deseas_totalper_fulltrainUS,
                                       h=n_fullUS,    holdout = FALSE,
                                       silent = FALSE)  

# Plot model + observed data
autoplot(ts_US_fulltrain) +
  autolayer(SES_deseas_totalper_fulltrainUS, series="SES Model",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US") 

# SARIMA Model (auto.arima again selects a non-seasonal ARIMA(0,1,0))
autoplot(ts_US_fulltrain)

SARIMA_totalper_fulltrainUS <- auto.arima(ts_US_fulltrain)
summary(SARIMA_totalper_fulltrainUS)
## Series: ts_US_fulltrain 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.5044:  log likelihood = -309.03
## AIC=620.07   AICc=620.08   BIC=623.73
## 
## Training set error measures:
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.003108681 0.7089948 0.3309191 -0.4788019 5.554759 0.3062833
##                    ACF1
## Training set 0.04163473
SARIMA_forecast_totalper_fulltrainUS <- forecast(object = SARIMA_totalper_fulltrainUS, h=n_fullUS)
summary(SARIMA_forecast_totalper_fulltrainUS)
## 
## Forecast method: ARIMA(0,1,0)
## 
## Model Information:
## Series: ts_US_fulltrain 
## ARIMA(0,1,0) 
## 
## sigma^2 = 0.5044:  log likelihood = -309.03
## AIC=620.07   AICc=620.08   BIC=623.73
## 
## Error measures:
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.003108681 0.7089948 0.3309191 -0.4788019 5.554759 0.3062833
##                    ACF1
## Training set 0.04163473
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## Jan 2025            3.8 2.8898050 4.710195  2.4079768 5.192023
## Feb 2025            3.8 2.5127899 5.087210  1.8313820 5.768618
## Mar 2025            3.8 2.2234960 5.376504  1.3889452 6.211055
## Apr 2025            3.8 1.9796099 5.620390  1.0159537 6.584046
## May 2025            3.8 1.7647421 5.835258  0.6873416 6.912658
## Jun 2025            3.8 1.5704866 6.029513  0.3902535 7.209746
## Jul 2025            3.8 1.3918503 6.208150  0.1170529 7.482947
## Aug 2025            3.8 1.2255797 6.374420 -0.1372361 7.737236
## Sep 2025            3.8 1.0694149 6.530585 -0.3760695 7.976069
## Oct 2025            3.8 0.9217106 6.678289 -0.6019638 8.201964
## Nov 2025            3.8 0.7812246 6.818775 -0.8168185 8.416819
## Dec 2025            3.8 0.6469919 6.953008 -1.0221097 8.622110
# Plot model + observed data
autoplot(ts_US_fulltrain) +
  autolayer(SARIMA_forecast_totalper_fulltrainUS, series="SARIMA Model",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US") 

# Plot all 4 candidate models together 
autoplot(ts_US_fulltrain) +
  autolayer(SS_for_totalper_fulltrainUS, series="SS with BSM Model",PI=FALSE)+
  autolayer(ETS_totalper_fulltrainUS, series="ETS",PI=FALSE)+
  autolayer(SES_deseas_totalper_fulltrainUS, series="SES Model",PI=FALSE)+
  autolayer(SARIMA_forecast_totalper_fulltrainUS, series="SARIMA Model",PI=FALSE)+
  ylab("Unemployment Rate (%)") + 
  ggtitle("Forecasted Unemployment Rate (%) in US")

Time Series Analysis for US (Outliers-removed Series)

Step 1: Transform into time series and set training and testing windows for US (Outliers)

# Clean the Total.Per series: tsclean() replaces outliers (and any missing
# values) with interpolated estimates.
ts_USout <- tsclean(ts_US[, "Total.Per"])

# Overlay the cleaned series on the original to show what changed.
autoplot(ts_USout, series = "Outliers-removed Series") +
  autolayer(ts_US[, "Total.Per"], series = "Original Series") +
  ylab("Unemployment Rate (%)")

# Training window: all but the final n_forUS observations of the cleaned series.
ts_US_trainout <- ts(ts_USout[seq_len(nobsUS - n_forUS)],
                     start = c(year(US$Month[1]), month(US$Month[1])),
                     frequency = 12)

# Testing window: the final n_forUS observations of the cleaned series.
ts_US_testout <- ts(ts_USout[start_rowUS:nobsUS],
                    start = c(year(US$Month[start_rowUS]),
                              month(US$Month[start_rowUS])),
                    frequency = 12)

# Side-by-side view of the two windows.
trainUSout <- autoplot(ts_US_trainout) +
  ylab("Unemployment Rate (%)") +
  ggtitle("Training Window")
testUSout <- autoplot(ts_US_testout) +
  ylab("Unemployment Rate (%)") +
  ggtitle("Testing Window")
grid.arrange(trainUSout, testUSout, ncol = 2)

# ACF / PACF of the cleaned training series.
par(mfrow = c(1, 2))
Acf(ts_US_trainout, lag = 40, plot = TRUE, main = "")
Pacf(ts_US_trainout, lag = 40, plot = TRUE, main = "")
par(mfrow = c(1, 1))

Step 2: Decompose the time series for US (Outliers)

# Decompose the cleaned training series into trend + seasonal + remainder
decom_totalper_trainUSout <- decompose(ts_US_trainout)
plot(decom_totalper_trainUSout)

# Deseason: remove the estimated seasonal component
deseas_totalper_trainUSout <- seasadj(decom_totalper_trainUSout)  
plot(deseas_totalper_trainUSout)

# Run the tests on deseasoned series
print(adf.test(deseas_totalper_trainUSout, alternative = "stationary")) # p = 0.4728 > 0.05: unit root, i.e. not stationary. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  deseas_totalper_trainUSout
## Dickey-Fuller = -2.245, Lag order = 6, p-value = 0.4728
## alternative hypothesis: stationary
summary(MannKendall(deseas_totalper_trainUSout)) # Significant decreasing trend (tau = -0.305).
## Score =  -11564 , Var(Score) = 2348637
## denominator =  37927.99
## tau = -0.305, 2-sided pvalue =4.5209e-14
# Run the tests on original series 
print(adf.test(ts_US_trainout, alternative = "stationary")) # p = 0.5664 > 0.05: unit root, i.e. not stationary. 
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_US_trainout
## Dickey-Fuller = -2.023, Lag order = 6, p-value = 0.5664
## alternative hypothesis: stationary
summary(SeasonalMannKendall(ts_US_trainout)) # Significant decreasing trend after accounting for seasonality.
## Score =  -944 , Var(Score) = 17157.33
## denominator =  3013.905
## tau = -0.313, 2-sided pvalue =5.7243e-13
summary(smk.test(ts_US_trainout)) # It has seasonality. 
## 
##  Seasonal Mann-Kendall trend test (Hirsch-Slack test)
## 
## data: ts_US_trainout
## alternative hypothesis: two.sided
## 
## Statistics for individual seasons
## 
## H0
##                       S   varS    tau      z  Pr(>|z|)   
## Season 1:   S = 0   -77 1429.7 -0.307 -2.010 0.0444311  *
## Season 2:   S = 0   -83 1429.7 -0.331 -2.169 0.0301066  *
## Season 3:   S = 0   -81 1429.0 -0.323 -2.116 0.0343207  *
## Season 4:   S = 0   -79 1431.7 -0.313 -2.061 0.0392597  *
## Season 5:   S = 0   -69 1427.0 -0.276 -1.800 0.0718447  .
## Season 6:   S = 0   -71 1429.0 -0.283 -1.852 0.0640620  .
## Season 7:   S = 0   -68 1430.7 -0.270 -1.771 0.0765017  .
## Season 8:   S = 0   -63 1429.7 -0.251 -1.640 0.1010598   
## Season 9:   S = 0   -72 1430.7 -0.286 -1.877 0.0605034  .
## Season 10:   S = 0  -81 1433.7 -0.320 -2.113 0.0346148  *
## Season 11:   S = 0  -97 1429.7 -0.386 -2.539 0.0111186  *
## Season 12:   S = 0 -103 1427.0 -0.412 -2.700 0.0069308 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Check for any differencing needed (one regular difference suggested for both)
print(ndiffs(ts_US_trainout))
## [1] 1
print(ndiffs(deseas_totalper_trainUSout))
## [1] 1

Step 3: Test Time Series Models for US (Outliers)

# Seasonal Naive Model (benchmark: repeats the same month of the previous year)
SNAIVE_deseas_totalperUSout <- snaive(ts_US_trainout, h=n_forUS)
autoplot(SNAIVE_deseas_totalperUSout)

checkresiduals(SNAIVE_deseas_totalperUSout) # Residuals are not iid (Ljung-Box p < 2.2e-16).

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 1369.4, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24
# Simple Moving Average Model
# sma() searches the averaging order; lines below report the criterion per order.
SMA_deseas_totalperUSout <- smooth::sma(y = deseas_totalper_trainUSout, h=n_forUS, 
                                     holdout = FALSE, silent = FALSE) 
## Order 1 - -89.718; Order 100 - 1121.6598; Order 200 - 1115.5954
## Order 1 - -89.718; Order 50 - 1022.3643; Order 100 - 1121.6598
## Order 1 - -89.718; Order 25 - 854.3343; Order 50 - 1022.3643
## Order 1 - -89.718; Order 13 - 645.0819; Order 25 - 854.3343
## Order 1 - -89.718; Order 7 - 424.4507; Order 13 - 645.0819
## Order 1 - -89.718; Order 4 - 227.5604; Order 7 - 424.4507
## Order 1 - -89.718; Order 2 - 25.8563; Order 4 - 227.5604
## Order 12 - 616.949

summary(SMA_deseas_totalperUSout)
## 
## Model estimated using sma() function: SMA(1)
## Response variable: y
## Distribution used in the estimation: Normal
## Loss function type: MSE; Loss function value: 0.042
## All coefficients were provided
## Error standard deviation: 0.2053
## Sample size: 276
## Number of estimated parameters: 1
## Number of degrees of freedom: 275
## Information criteria:
##      AIC     AICc      BIC     BICc 
## -89.7326 -89.7180 -86.1122 -86.0712
checkresiduals(SMA_deseas_totalperUSout) # Residuals are not iid (Ljung-Box p = 1.11e-15).

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 125.35, df = 24, p-value = 1.11e-15
## 
## Model df: 0.   Total lags used: 24
# Simple Exponential Smoothing Model (fit on the deseasoned, cleaned series)
SES_deseas_totalperUSout = ses( y = deseas_totalper_trainUSout, h=n_forUS, 
                             holdout = FALSE, silent = FALSE)  
summary(SES_deseas_totalperUSout)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = deseas_totalper_trainUSout, h = n_forUS, holdout = FALSE, 
##     silent = FALSE)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
## 
##   Initial states:
##     l = 4.2968 
## 
##   sigma:  0.2057
## 
##      AIC     AICc      BIC 
## 682.3091 682.3974 693.1703 
## 
## Error measures:
##                        ME      RMSE       MAE        MPE     MAPE      MASE
## Training set -0.002000153 0.2049472 0.1502127 -0.1151749 2.706719 0.1638427
##                   ACF1
## Training set 0.2700784
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Jan 2024       3.744812 3.481205 4.008419 3.341659 4.147965
## Feb 2024       3.744812 3.372034 4.117590 3.174697 4.314927
## Mar 2024       3.744812 3.288261 4.201363 3.046578 4.443046
## Apr 2024       3.744812 3.217637 4.271987 2.938567 4.551057
## May 2024       3.744812 3.155415 4.334209 2.843408 4.646216
## Jun 2024       3.744812 3.099162 4.390462 2.757376 4.732248
## Jul 2024       3.744812 3.047432 4.442192 2.678262 4.811362
## Aug 2024       3.744812 2.999283 4.490341 2.604624 4.885000
## Sep 2024       3.744812 2.954060 4.535564 2.535462 4.954162
## Oct 2024       3.744812 2.911288 4.578336 2.470046 5.019578
## Nov 2024       3.744812 2.870605 4.619019 2.407828 5.081796
## Dec 2024       3.744812 2.831733 4.657891 2.348379 5.141245
autoplot(SES_deseas_totalperUSout)

checkresiduals(SES_deseas_totalperUSout) # Residuals are not iid (Ljung-Box p = 1.332e-15).

## 
##  Ljung-Box test
## 
## data:  Residuals from Simple exponential smoothing
## Q* = 124.87, df = 24, p-value = 1.332e-15
## 
## Model df: 0.   Total lags used: 24
# SARIMA Model (auto.arima on the cleaned, still-seasonal series)
SARIMA_totalperUSout <- auto.arima(ts_US_trainout)
print(SARIMA_totalperUSout)
## Series: ts_US_trainout 
## ARIMA(4,1,0)(1,1,1)[12] 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4     sar1     sma1
##       0.1386  0.2472  0.1517  0.0489  -0.0319  -0.8654
## s.e.  0.0614  0.0624  0.0617  0.0624   0.0741   0.0545
## 
## sigma^2 = 0.03982:  log likelihood = 45.01
## AIC=-76.03   AICc=-75.59   BIC=-51.02
SARIMA_forecast_totalperUSout <- forecast(object = SARIMA_totalperUSout, h=n_forUS)
autoplot(SARIMA_forecast_totalperUSout)

checkresiduals(SARIMA_forecast_totalperUSout) # Residuals are not iid (Ljung-Box p = 0.0054).

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(4,1,0)(1,1,1)[12]
## Q* = 36.901, df = 18, p-value = 0.005399
## 
## Model df: 6.   Total lags used: 24
# Deseasoned ARIMA Model (fit on the seasonally adjusted series;
# seasonal terms suppressed via max.P/max.Q/max.D = 0)
ARIMA_totalperUSout <- auto.arima(deseas_totalper_trainUSout, max.D = 0, 
                               max.P = 0, max.Q = 0)
print(ARIMA_totalperUSout)
## Series: deseas_totalper_trainUSout 
## ARIMA(1,1,2) 
## 
## Coefficients:
##          ar1      ma1     ma2
##       0.7766  -0.6386  0.1651
## s.e.  0.0781   0.0931  0.0663
## 
## sigma^2 = 0.03524:  log likelihood = 71.16
## AIC=-134.31   AICc=-134.16   BIC=-119.85
ARIMA_forecast_totalperUSout <- forecast(object = ARIMA_totalperUSout, h=n_forUS)
autoplot(ARIMA_forecast_totalperUSout)

checkresiduals(ARIMA_forecast_totalperUSout) # Residuals are NOT iid (Ljung-Box p = 0.037 < 0.05).

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)
## Q* = 33.924, df = 21, p-value = 0.03692
## 
## Model df: 3.   Total lags used: 24
# STL + ETS Model (stlf: STL decomposition, then ETS on the adjusted part)
ETS_totalperUSout <-  stlf(ts_US_trainout,h=n_forUS)
autoplot(ETS_totalperUSout) 

checkresiduals(ETS_totalperUSout) # Residuals are not iid (Ljung-Box p = 1.1e-05). 

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,Ad,N)
## Q* = 65.36, df = 24, p-value = 1.078e-05
## 
## Model df: 0.   Total lags used: 24
# ARIMA + FOURIER Model: seasonality captured by K=3 Fourier regressors
ARIMA_Four_fit_totalperUSout <- auto.arima(ts_US_trainout, 
                             seasonal=FALSE, lambda=0,
                             xreg=fourier(ts_US_trainout, 
                                          K=3))

ARIMA_Four_for_totalperUSout <- forecast(ARIMA_Four_fit_totalperUSout,
                           xreg=fourier(ts_US_trainout,
                                        K=3, h=n_forUS),
                           h=n_forUS) 

autoplot(ARIMA_Four_for_totalperUSout)

checkresiduals(ARIMA_Four_for_totalperUSout) # Residuals are not iid (Ljung-Box p = 1.5e-08). 

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(3,1,1) errors
## Q* = 76.526, df = 20, p-value = 1.514e-08
## 
## Model df: 4.   Total lags used: 24
# TBATS Model 
TBATS_fit_totalperUSout <- tbats(ts_US_trainout)
TBATS_for_totalperUSout <- forecast(TBATS_fit_totalperUSout, h = n_forUS)
autoplot(TBATS_for_totalperUSout) 

checkresiduals(TBATS_fit_totalperUSout) # Residuals are not iid (Ljung-Box p = 0.00067). 

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 52.491, df = 24, p-value = 0.0006748
## 
## Model df: 0.   Total lags used: 24
# Neural Network Model (NNAR with 3 lags, seasonality via K=3 Fourier regressors)
NN_fit_totalperUSout <- nnetar(ts_US_trainout,
                 p=3, P=0,
                 xreg=fourier(ts_US_trainout, K=3))

NN_for_totalperUSout <- forecast(NN_fit_totalperUSout, 
                   h=n_forUS,
                   xreg=fourier(ts_US_trainout, 
                                          K=3,h=n_forUS))

autoplot(NN_for_totalperUSout)

checkresiduals(NN_fit_totalperUSout) # Residuals are not iid (Ljung-Box p = 0.00011). 

## 
##  Ljung-Box test
## 
## data:  Residuals from NNAR(3,5)
## Q* = 58.331, df = 24, p-value = 0.0001095
## 
## Model df: 0.   Total lags used: 24
## State Space Exponential Smoothing Model (model="ZZZ" lets es() pick components)
SSES_seas_totalperUSout <- es(ts_US_trainout,
                         model="ZZZ", h=n_forUS, holdout=FALSE)
checkresiduals(SSES_seas_totalperUSout) # Residuals are not iid (Ljung-Box p = 0.0048).

## 
##  Ljung-Box test
## 
## data:  Residuals
## Q* = 45.682, df = 24, p-value = 0.004832
## 
## Model df: 0.   Total lags used: 24
## State Space with BSM Model
# NOTE(review): the first three entries of `fixed` appear to pin the level,
# slope and seasonal variances, with NA leaving one variance free -- confirm
# the ordering against the StructTS documentation.
SS_seas_totalperUSout <- StructTS(ts_US_trainout,
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

SS_for_totalperUSout <- forecast(SS_seas_totalperUSout,h=n_forUS)

plot(SS_for_totalperUSout)

checkresiduals(SS_seas_totalperUSout) # Residuals are not iid (Ljung-Box p < 2.2e-16). 

## 
##  Ljung-Box test
## 
## data:  Residuals from StructTS
## Q* = 401.87, df = 24, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 24

Step 4: Performance check for US (Outliers)

# Score each model's 12-month forecast against the cleaned test window.
# accuracy() returns ME, RMSE, MAE, MPE, MAPE, ACF1 and Theil's U.
# (Renamed the misspelled `SANIVE_` object to `SNAIVE_`.)
SNAIVE_tpscoresUSout <- accuracy(SNAIVE_deseas_totalperUSout$mean, ts_US_testout)
SMA_tpscoresUSout <- accuracy(SMA_deseas_totalperUSout$forecast, ts_US_testout)
SES_tpscoresUSout <- accuracy(SES_deseas_totalperUSout$mean, ts_US_testout)
SARIMA_tpscoresUSout <- accuracy(SARIMA_forecast_totalperUSout$mean, ts_US_testout)
ARIMA_tpscoresUSout <- accuracy(ARIMA_forecast_totalperUSout$mean, ts_US_testout)
ETS_tpscoresUSout <- accuracy(ETS_totalperUSout$mean, ts_US_testout)
ARIMA_Four_tpscoresUSout <- accuracy(ARIMA_Four_for_totalperUSout$mean, ts_US_testout)
TBATS_tpscoresUSout <- accuracy(TBATS_for_totalperUSout$mean, ts_US_testout)
NN_tpscoresUSout <- accuracy(NN_for_totalperUSout$mean, ts_US_testout)
SSES_tpscoresUSout <- accuracy(SSES_seas_totalperUSout$forecast, ts_US_testout)
SS_tpscoresUSout <- accuracy(SS_for_totalperUSout$mean, ts_US_testout)

# Stack the one-row score matrices and add a row-wise average across all
# metrics (note: this mixes signed and unsigned metrics).
tpscoresUSout <- as.data.frame(rbind(SNAIVE_tpscoresUSout, SMA_tpscoresUSout,
                                     SES_tpscoresUSout, SARIMA_tpscoresUSout, ARIMA_tpscoresUSout,
                                     ETS_tpscoresUSout, ARIMA_Four_tpscoresUSout, TBATS_tpscoresUSout,
                                     NN_tpscoresUSout, SSES_tpscoresUSout, SS_tpscoresUSout)) %>%
  mutate(Average = rowMeans(., na.rm = TRUE))

row.names(tpscoresUSout) <- c("SNAIVE", "SMA", "SES", "SARIMA", "ARIMA",
                              "ETS", "ARIMA_FOURIER", "TBATS", "NNETAR",
                              "SSES", "BSM")

# Choose the model with the lowest average error
best_model_index_tpUSout <- which.min(tpscoresUSout[,"Average"])
cat("The best model by Average is:", row.names(tpscoresUSout)[best_model_index_tpUSout])

# Create the accuracy table.
# Highlight the row of the best (lowest-Average) model.
# BUG FIX: stripe_index previously used `seas_scores`, an object from a
# different section; it must use this table's own best-model index.
kbl(tpscoresUSout, 
      caption = "Forecast Accuracy for Unemployment Rate (%) Data",
      digits = array(5,ncol(tpscoresUSout))) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  kable_styling(latex_options="striped", stripe_index = best_model_index_tpUSout)
Forecast Accuracy for Unemployment Rate (%) Data
ME RMSE MAE MPE MAPE ACF1 Theil’s U Average
SNAIVE 0.38333 0.40620 0.38333 9.46470 9.46470 0.28333 1.35284 3.10549
SMA 0.27186 0.39196 0.32013 6.30418 7.67176 0.44396 1.24293 2.37811
SES 0.27185 0.39195 0.32013 6.30406 7.67168 0.44396 1.24292 2.37808
SARIMA 0.29883 0.35509 0.30781 7.39350 7.61242 0.58243 1.20095 2.53586
ARIMA 0.35322 0.45308 0.38245 8.34064 9.17576 0.46153 1.44837 2.94501
ETS 0.08226 0.13550 0.10280 2.00098 2.52297 0.12960 0.46055 0.77638
ARIMA_FOURIER 0.50944 0.55013 0.50944 12.48430 12.48430 0.55452 1.81939 4.13022
TBATS 0.44741 0.49729 0.44741 10.95395 10.95395 0.62666 1.65645 3.65473
NNETAR 0.10046 0.14368 0.11184 2.36816 2.67759 0.21744 0.47236 0.87022
SSES 0.41563 0.47631 0.42271 10.27258 10.44526 0.65134 1.60031 3.46916
BSM 0.05951 0.12113 0.10407 1.37766 2.53923 0.04463 0.39014 0.66234
# Plot everything together
# Overlay each model's held-out forecast (point forecasts only, PI=FALSE)
# on the observed test series; series= labels feed the shared colour legend.
autoplot(ts_US_testout) +
  autolayer(SNAIVE_deseas_totalperUSout, PI=FALSE, series="SNAIVE") + 
  autolayer(SES_deseas_totalperUSout, PI=FALSE, series="SES") +
  autolayer(SARIMA_forecast_totalperUSout, PI=FALSE, series="SARIMA") +
  autolayer(ARIMA_forecast_totalperUSout, PI=FALSE, series="ARIMA") +
  autolayer(ETS_totalperUSout, PI=FALSE, series="ETS") +
  autolayer(ARIMA_Four_for_totalperUSout, PI=FALSE, series="ARIMA_FOURIER") +
  autolayer(TBATS_for_totalperUSout, PI=FALSE, series="TBATS") +
  autolayer(NN_for_totalperUSout, PI=FALSE, series="NNETAR") +
  autolayer(SS_for_totalperUSout, PI=FALSE, series="BSM") +
  guides(colour=guide_legend(title="Forecast")) # SMA and SSES could not run

Step 5: Forecast for 2025 with the best three models for US (Outliers)

# Set the forecasting period
# 12 monthly steps ahead, i.e. all of 2025
n_fullUS = 12

# Create the time series to retain full data set
# Rebuild the outlier-adjusted series as a monthly ts anchored at the first
# observed month, so the final forecasts are trained on ALL available data
# (no train/test split this time).
ts_US_fulltrainout <- ts(ts_USout,
              start=c(year(US$Month[1]), month(US$Month[1])),
              frequency = 12)

# Fit SS with BSM Model 
# State-space Basic Structural Model via StructTS. Three of the four
# parameters are held fixed and only the NA entry is estimated.
# NOTE(review): assumed fixed= order is (level, slope, seasonal, observation)
# variances, matching StructTS's BSM parameterization — confirm against
# ?StructTS.
SS_seas_totalper_fulltrainUSout <- StructTS(ts_US_fulltrainout,
                    type="BSM",fixed=c(0.01,0.001,0.1,NA)) 

# Forecast 12 months ahead from the full-data BSM fit
SS_for_totalper_fulltrainUSout <- forecast(SS_seas_totalper_fulltrainUSout,h=n_fullUS)

# Plot model + observed data
autoplot(ts_US_fulltrainout) +
  autolayer(SS_for_totalper_fulltrainUSout, series="SS with BSM Model",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US") 

# Fit Neural Network Model 
# NNAR with 3 non-seasonal lags (p=3), no seasonal lags (P=0), and K=3
# Fourier pairs as external regressors to capture the annual seasonality.
NN_fit_totalper_fulltrainUSout <- nnetar(ts_US_fulltrainout,
                 p=3, P=0,
                 xreg=fourier(ts_US_fulltrainout, K=3))

# Forecast requires future values of the same Fourier regressors,
# generated for the h-step horizon with the same K.
NN_for_totalper_fulltrainUSout <- forecast(NN_fit_totalper_fulltrainUSout, 
                   h=n_fullUS,
                   xreg=fourier(ts_US_fulltrainout, 
                                          K=3,h=n_fullUS))

# Plot model + observed data
autoplot(ts_US_fulltrainout) +
  autolayer(NN_for_totalper_fulltrainUSout, series="NNETAR",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US")

# Fit STL + ETS Model
# stlf() decomposes the series with STL, forecasts the seasonally adjusted
# component (ETS by default), and re-seasonalizes — fit and forecast in one call.
ETS_totalper_fulltrainUSout <-  stlf(ts_US_fulltrainout,h=n_fullUS)

# Plot model + observed data
autoplot(ts_US_fulltrainout) +
  autolayer(ETS_totalper_fulltrainUSout, series="ETS",PI=FALSE)+
  ylab("Forecasted Unemployment Rate (%) in US")

# Plot the 3 best models together over the outlier-adjusted series.
# (Bug fix: the original plotted ts_US_fulltrain — the series WITH outliers —
# under forecasts fitted on the outlier-removed series, and the comment said
# "4 models" although only three are layered.)
autoplot(ts_US_fulltrainout) +
  autolayer(SS_for_totalper_fulltrainUSout, series="SS with BSM Model",PI=FALSE)+
  autolayer(NN_for_totalper_fulltrainUSout, series="NNETAR",PI=FALSE)+
  autolayer(ETS_totalper_fulltrainUSout, series="ETS",PI=FALSE)+
  ylab("Unemployment Rate (%)") + 
  ggtitle("Forecasted Unemployment Rate (%) in US (Outliers Removed)")

Jisup: I added the average of the three models above, since I could not decide which single model was best. :)

# --- 1. Calculate the average forecast of the three models ---
# Pull the point-forecast paths out of each fitted model's forecast object
bsm_fc  <- as.numeric(SS_for_totalper_fulltrainUSout$mean)
nnar_fc <- as.numeric(NN_for_totalper_fulltrainUSout$mean)
ets_fc  <- as.numeric(ETS_totalper_fulltrainUSout$mean)

# Equal-weight ensemble: element-wise mean of the three forecast paths
avg_fc <- rowMeans(cbind(bsm_fc, nnar_fc, ets_fc))


# Create date index for forecast
start_date <- as.Date(paste0(end(US$Month)[1] + 1, "-", end(US$Month)[2], "-01"))  # 1 month after last
forecast_dates <- seq(from = as.Date("2025-01-01"), by = "month", length.out = n_fullUS)

# Create forecast dataframe
forecast_df <- tibble(
  Month = forecast_dates,
  BSM = bsm_fc,
  NNAR = nnar_fc,
  ETS = ets_fc,
  Avg_Forecast = avg_fc
)

# --- 2. Export forecast to Excel ---
# Write the per-model and averaged forecasts one level above the Code/
# working directory (openxlsx::write.xlsx)
write.xlsx(forecast_df, "../Forecast_Average_US.xlsx")

# --- 3. Plot actuals (from 2020) and forecast average ---
# Extract actual values after 2020
us_actual_2020on <- US %>%
  filter(Month >= as.Date("2020-01-01")) %>%
  select(Month, Actual = Total.Per)


# Combine with forecast
# NOTE(review): this plot_df is reassigned by the 2022 version just below
# before any plot uses it, so this 2020 variant is currently dead code —
# confirm whether a separate 2020-based plot was intended here.
plot_df <- bind_rows(
  tibble(Month = us_actual_2020on$Month,
         Value = us_actual_2020on$Actual,
         Type = "Actual"),
  tibble(Month = forecast_dates,
         Value = avg_fc,
         Type = "Avg Forecast")
)

# --- 4. Plot actuals (from 2022) and forecast average ---
# Actual unemployment values from January 2022 onward
us_actual_2022on <- US %>%
  filter(Month >= as.Date("2022-01-01")) %>%
  select(Month, Actual = Total.Per)

# Stack actuals and the averaged forecast into one long data frame,
# tagged by Type so ggplot can colour them separately
actual_part <- tibble(
  Month = us_actual_2022on$Month,
  Value = us_actual_2022on$Actual,
  Type = "Actual"
)
forecast_part <- tibble(
  Month = forecast_dates,
  Value = avg_fc,
  Type = "Avg Forecast"
)
plot_df <- bind_rows(actual_part, forecast_part)


# Plot
# Single continuous line across the actual/forecast boundary (group = 1
# puts every point in one path), recoloured by Type.
ggplot(plot_df, aes(x = Month, y = Value, color = Type, group = 1)) +
  # linewidth replaces size for lines (size deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("Actual" = "black", "Avg Forecast" = "steelblue")) +
  labs(
    title = "US Unemployment Rate: Actual (2022~) + Avg Forecast (2025)",
    y = "Unemployment Rate (%)",
    x = "Month",
    color = "Legend"
  ) +
  theme_minimal()